diff --git a/RELEASE.md b/RELEASE.md index a181412be2719..efd075dabcba9 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,6 @@ Release Notes ============= -The list of changes to pandas between each release can be found +The list of changes to Pandas between each release can be found [here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full details, see the commit logs at http://github.com/pandas-dev/pandas. diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 51c3d5578ae6c..1aebfc140284d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -884,6 +884,8 @@ Bug Fixes - Bug in ``.at`` when selecting from a tz-aware column (:issue:`15822`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`, :issue:`15765`) +- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) +- Bug in ``Series.replace`` which replaced a numeric by string (:issue:`15743`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) @@ -986,7 +988,6 @@ Bug Fixes - Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.1``) (:issue:`9351`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) -- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 3b9bfe1de48e7..91039f3270af2 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -9,10 +9,16 @@ from pandas.compat import range, string_types from pandas.types.common import (is_numeric_v_string_like, - is_float_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_integer_dtype, - _ensure_float64, is_scalar, - needs_i8_conversion, is_integer) + is_float_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_scalar, + is_integer, + needs_i8_conversion, + _ensure_float64) + +from pandas.types.cast import infer_dtype_from_array from pandas.types.missing import isnull @@ -21,11 +27,11 @@ def mask_missing(arr, values_to_mask): Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True """ - if not isinstance(values_to_mask, (list, np.ndarray)): - values_to_mask = [values_to_mask] + dtype, values_to_mask = infer_dtype_from_array(values_to_mask) try: - values_to_mask = np.array(values_to_mask, dtype=arr.dtype) + values_to_mask = np.array(values_to_mask, dtype=dtype) + except Exception: values_to_mask = np.array(values_to_mask, dtype=object) @@ -409,7 +415,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, if axis != 0: # pragma: no cover raise AssertionError("cannot interpolate on a ndim == 1 with " "axis != 0") - values = values.reshape(tuple((1, ) + values.shape)) + values = values.reshape(tuple((1,) + values.shape)) if fill_value is None: mask = None @@ -447,7 +453,6 @@ def wrapper(arr, mask, limit=None): def pad_1d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -472,7 +477,6 @@ def pad_1d(values, limit=None, mask=None, dtype=None): def backfill_1d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -498,7 +502,6 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): def pad_2d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -528,7 +531,6 @@ def pad_2d(values, limit=None, mask=None, dtype=None): def backfill_2d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 8b50036cd50f8..fce59e10bf4bd 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -795,7 +795,7 @@ def test_replace_dtypes(self): expected = DataFrame({'datetime64': Index([now] * 3)}) assert_frame_equal(result, expected) - def test_replace_input_formats(self): + def test_replace_input_formats_listlike(self): # both dicts to_rep = {'A': np.nan, 'B': 0, 'C': ''} values = {'A': 0, 'B': -1, 'C': 'missing'} @@ -812,15 +812,6 @@ def test_replace_input_formats(self): 'C': ['', 'asdf', 'fd']}) assert_frame_equal(result, expected) - # dict to scalar - filled = df.replace(to_rep, 0) - expected = {} - for k, v in compat.iteritems(df): - expected[k] = v.replace(to_rep[k], 0) - assert_frame_equal(filled, DataFrame(expected)) - - self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, '']) - # scalar to dict values = {'A': 0, 'B': -1, 'C': 'missing'} df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5], @@ -842,6 +833,20 @@ def test_replace_input_formats(self): self.assertRaises(ValueError, df.replace, to_rep, values[1:]) + def test_replace_input_formats_scalar(self): + df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5], + 'C': ['', 'asdf', 'fd']}) + + # dict to scalar + to_rep = {'A': np.nan, 'B': 0, 'C': ''} + filled = df.replace(to_rep, 0) + expected = {} + for k, v in compat.iteritems(df): + expected[k] = v.replace(to_rep[k], 0) + assert_frame_equal(filled, DataFrame(expected)) + + self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, '']) + # list to scalar to_rep = [np.nan, 0, ''] result = df.replace(to_rep, -1) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 0a53581e24ba5..5190eb110f4cf 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -10,7 +10,6 @@ class TestSeriesReplace(TestData, tm.TestCase): - def test_replace(self): N = 100 ser = pd.Series(np.random.randn(N)) @@ -227,3 +226,24 @@ def test_replace_with_empty_dictlike(self): s = pd.Series(list('abcd')) tm.assert_series_equal(s, s.replace(dict())) tm.assert_series_equal(s, s.replace(pd.Series([]))) + + def test_replace_string_with_number(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace('2', np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) + + def test_replace_unicode_with_number(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace(u'2', np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) + + def test_replace_mixed_types_with_string(self): + # Testing mixed + s = pd.Series([1, 2, 3, '4', 4, 5]) + result = s.replace([2, '4'], np.nan) + expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index dd4ea3bb02be9..de6ef7af9d7f9 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -5,13 +5,15 @@ """ -from datetime import datetime +import pytest +from datetime import datetime, timedelta, date import numpy as np from pandas import Timedelta, Timestamp, DatetimeIndex from pandas.types.cast import (maybe_downcast_to_dtype, maybe_convert_objects, infer_dtype_from_scalar, + infer_dtype_from_array, maybe_convert_string_to_object, maybe_convert_scalar, find_common_type) @@ -82,7 +84,7 @@ def test_datetime_with_timezone(self): tm.assert_index_equal(res, exp) -class TestInferDtype(tm.TestCase): +class TestInferDtype(object): def test_infer_dtype_from_scalar(self): # Test that _infer_dtype_from_scalar is returning correct dtype for int @@ -92,44 +94,62 @@ def test_infer_dtype_from_scalar(self): np.int32, np.uint64, np.int64]: data = dtypec(12) dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, type(data)) + assert dtype == type(data) data = 12 dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.int64) + assert dtype == np.int64 for dtypec in [np.float16, np.float32, np.float64]: data = dtypec(12) dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, dtypec) + assert dtype == dtypec data = np.float(12) dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.float64) + assert dtype == np.float64 for data in [True, False]: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.bool_) + assert dtype == np.bool_ for data in [np.complex64(1), np.complex128(1)]: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.complex_) + assert dtype == np.complex_ - import datetime for data in [np.datetime64(1, 'ns'), Timestamp(1), - datetime.datetime(2000, 1, 1, 0, 0)]: + datetime(2000, 1, 1, 0, 0)]: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'M8[ns]') + assert dtype == 'M8[ns]' for data in [np.timedelta64(1, 'ns'), Timedelta(1), - datetime.timedelta(1)]: + timedelta(1)]: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'm8[ns]') + assert dtype == 'm8[ns]' - for data in [datetime.date(2000, 1, 1), + for data in [date(2000, 1, 1), Timestamp(1, tz='US/Eastern'), 'foo']: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.object_) + assert dtype == np.object_ + + @pytest.mark.parametrize( + "arr, expected", + [('foo', np.object_), + (b'foo', np.object_), + (1, np.int_), + (1.5, np.float_), + ([1], np.int_), + (np.array([1]), np.int_), + ([np.nan, 1, ''], np.object_), + (np.array([[1.0, 2.0]]), np.float_), + (Timestamp('20160101'), np.object_), + (np.datetime64('2016-01-01'), np.dtype('>> np.asarray([1, '1']) + array(['1', '1'], dtype='>> infer_dtype_from_array([1, '1']) + (numpy.object_, [1, '1']) + + """ + + if isinstance(arr, np.ndarray): + return arr.dtype, arr + + if not is_list_like(arr): + arr = [arr] + + # don't force numpy coerce with nan's + inferred = lib.infer_dtype(arr) + if inferred in ['string', 'bytes', 'unicode', + 'mixed', 'mixed-integer']: + return (np.object_, arr) + + arr = np.asarray(arr) + return arr.dtype, arr + + def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): """ provide explict type promotion and coercion