Skip to content

Commit

Permalink
BUG: replace of numeric by string / dtype conversion (GH15743)
Browse files Browse the repository at this point in the history
closes pandas-dev#15743

Author: Carlos Souza <[email protected]>
Author: Jeff Reback <[email protected]>

Closes pandas-dev#15812 from ucals/bug-fix-15743 and squashes the following commits:

e6e4971 [Carlos Souza] Adding replace unicode with number and replace mixed types with string tests
bd31b2b [Carlos Souza] Resolving merge conflict by incorporating @jreback suggestions
73805ce [Jeff Reback] CLN: add infer_dtype_from_array
45e67e4 [Carlos Souza] Fixing PEP8 line indent
0a98557 [Carlos Souza] BUG: replace of numeric by string fixed
97e1f18 [Carlos Souza] Test
e62763c [Carlos Souza] Fixing PEP8 line indent
080c71e [Carlos Souza] BUG: replace of numeric by string fixed
8b463cb [Carlos Souza] Merge remote-tracking branch 'upstream/master'
9fc617b [Carlos Souza] Merge remote-tracking branch 'upstream/master'
e12bca7 [Carlos Souza] Sync fork
676a4e5 [Carlos Souza] Test
  • Loading branch information
Carlos Souza authored and jreback committed Mar 28, 2017
1 parent d96ff29 commit 6f789e1
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 40 deletions.
2 changes: 1 addition & 1 deletion RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Release Notes
=============

The list of changes to pandas between each release can be found
The list of changes to Pandas between each release can be found
[here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full
details, see the commit logs at http://github.com/pandas-dev/pandas.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,8 @@ Bug Fixes
- Bug in ``.at`` when selecting from a tz-aware column (:issue:`15822`)
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
- Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`, :issue:`15765`)
- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`)
- Bug in ``Series.replace`` where a string ``to_replace`` value was coerced to the dtype of the data and so incorrectly matched numeric values (:issue:`15743`)

- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)

Expand Down Expand Up @@ -986,7 +988,6 @@ Bug Fixes

- Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.1``) (:issue:`9351`)
- Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`)
- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`)
- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
- Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
26 changes: 14 additions & 12 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,16 @@

from pandas.compat import range, string_types
from pandas.types.common import (is_numeric_v_string_like,
is_float_dtype, is_datetime64_dtype,
is_datetime64tz_dtype, is_integer_dtype,
_ensure_float64, is_scalar,
needs_i8_conversion, is_integer)
is_float_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_integer_dtype,
is_scalar,
is_integer,
needs_i8_conversion,
_ensure_float64)

from pandas.types.cast import infer_dtype_from_array
from pandas.types.missing import isnull


Expand All @@ -21,11 +27,11 @@ def mask_missing(arr, values_to_mask):
Return a masking array of same size/shape as arr
with entries equaling any member of values_to_mask set to True
"""
if not isinstance(values_to_mask, (list, np.ndarray)):
values_to_mask = [values_to_mask]
dtype, values_to_mask = infer_dtype_from_array(values_to_mask)

try:
values_to_mask = np.array(values_to_mask, dtype=arr.dtype)
values_to_mask = np.array(values_to_mask, dtype=dtype)

except Exception:
values_to_mask = np.array(values_to_mask, dtype=object)

Expand Down Expand Up @@ -409,7 +415,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
if axis != 0: # pragma: no cover
raise AssertionError("cannot interpolate on a ndim == 1 with "
"axis != 0")
values = values.reshape(tuple((1, ) + values.shape))
values = values.reshape(tuple((1,) + values.shape))

if fill_value is None:
mask = None
Expand Down Expand Up @@ -447,7 +453,6 @@ def wrapper(arr, mask, limit=None):


def pad_1d(values, limit=None, mask=None, dtype=None):

if dtype is None:
dtype = values.dtype
_method = None
Expand All @@ -472,7 +477,6 @@ def pad_1d(values, limit=None, mask=None, dtype=None):


def backfill_1d(values, limit=None, mask=None, dtype=None):

if dtype is None:
dtype = values.dtype
_method = None
Expand All @@ -498,7 +502,6 @@ def backfill_1d(values, limit=None, mask=None, dtype=None):


def pad_2d(values, limit=None, mask=None, dtype=None):

if dtype is None:
dtype = values.dtype
_method = None
Expand Down Expand Up @@ -528,7 +531,6 @@ def pad_2d(values, limit=None, mask=None, dtype=None):


def backfill_2d(values, limit=None, mask=None, dtype=None):

if dtype is None:
dtype = values.dtype
_method = None
Expand Down
25 changes: 15 additions & 10 deletions pandas/tests/frame/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,7 @@ def test_replace_dtypes(self):
expected = DataFrame({'datetime64': Index([now] * 3)})
assert_frame_equal(result, expected)

def test_replace_input_formats(self):
def test_replace_input_formats_listlike(self):
# both dicts
to_rep = {'A': np.nan, 'B': 0, 'C': ''}
values = {'A': 0, 'B': -1, 'C': 'missing'}
Expand All @@ -812,15 +812,6 @@ def test_replace_input_formats(self):
'C': ['', 'asdf', 'fd']})
assert_frame_equal(result, expected)

# dict to scalar
filled = df.replace(to_rep, 0)
expected = {}
for k, v in compat.iteritems(df):
expected[k] = v.replace(to_rep[k], 0)
assert_frame_equal(filled, DataFrame(expected))

self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

# scalar to dict
values = {'A': 0, 'B': -1, 'C': 'missing'}
df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
Expand All @@ -842,6 +833,20 @@ def test_replace_input_formats(self):

self.assertRaises(ValueError, df.replace, to_rep, values[1:])

def test_replace_input_formats_scalar(self):
df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
'C': ['', 'asdf', 'fd']})

# dict to scalar
to_rep = {'A': np.nan, 'B': 0, 'C': ''}
filled = df.replace(to_rep, 0)
expected = {}
for k, v in compat.iteritems(df):
expected[k] = v.replace(to_rep[k], 0)
assert_frame_equal(filled, DataFrame(expected))

self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

# list to scalar
to_rep = [np.nan, 0, '']
result = df.replace(to_rep, -1)
Expand Down
22 changes: 21 additions & 1 deletion pandas/tests/series/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@


class TestSeriesReplace(TestData, tm.TestCase):

def test_replace(self):
N = 100
ser = pd.Series(np.random.randn(N))
Expand Down Expand Up @@ -227,3 +226,24 @@ def test_replace_with_empty_dictlike(self):
s = pd.Series(list('abcd'))
tm.assert_series_equal(s, s.replace(dict()))
tm.assert_series_equal(s, s.replace(pd.Series([])))

def test_replace_string_with_number(self):
    # GH 15743: the string '2' must not match the integer 2, so the
    # Series is expected to come back unchanged.
    original = pd.Series([1, 2, 3])
    replaced = original.replace('2', np.nan)
    tm.assert_series_equal(pd.Series([1, 2, 3]), replaced)

def test_replace_unicode_with_number(self):
    # GH 15743: same check as the plain-string case, but with an explicit
    # unicode literal as the value to replace; nothing should change.
    original = pd.Series([1, 2, 3])
    replaced = original.replace(u'2', np.nan)
    tm.assert_series_equal(pd.Series([1, 2, 3]), replaced)

def test_replace_mixed_types_with_string(self):
    # Mixed int / str data: the integer 2 and the string '4' are both
    # replaced, while the integer 4 is left alone.
    mixed = pd.Series([1, 2, 3, '4', 4, 5])
    targets = [2, '4']
    replaced = mixed.replace(targets, np.nan)
    wanted = pd.Series([1, np.nan, 3, np.nan, 4, 5])
    tm.assert_series_equal(wanted, replaced)
50 changes: 35 additions & 15 deletions pandas/tests/types/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
"""

from datetime import datetime
import pytest
from datetime import datetime, timedelta, date
import numpy as np

from pandas import Timedelta, Timestamp, DatetimeIndex
from pandas.types.cast import (maybe_downcast_to_dtype,
maybe_convert_objects,
infer_dtype_from_scalar,
infer_dtype_from_array,
maybe_convert_string_to_object,
maybe_convert_scalar,
find_common_type)
Expand Down Expand Up @@ -82,7 +84,7 @@ def test_datetime_with_timezone(self):
tm.assert_index_equal(res, exp)


class TestInferDtype(tm.TestCase):
class TestInferDtype(object):

def test_infer_dtype_from_scalar(self):
# Test that _infer_dtype_from_scalar is returning correct dtype for int
Expand All @@ -92,44 +94,62 @@ def test_infer_dtype_from_scalar(self):
np.int32, np.uint64, np.int64]:
data = dtypec(12)
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, type(data))
assert dtype == type(data)

data = 12
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.int64)
assert dtype == np.int64

for dtypec in [np.float16, np.float32, np.float64]:
data = dtypec(12)
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, dtypec)
assert dtype == dtypec

data = np.float(12)
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.float64)
assert dtype == np.float64

for data in [True, False]:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.bool_)
assert dtype == np.bool_

for data in [np.complex64(1), np.complex128(1)]:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.complex_)
assert dtype == np.complex_

import datetime
for data in [np.datetime64(1, 'ns'), Timestamp(1),
datetime.datetime(2000, 1, 1, 0, 0)]:
datetime(2000, 1, 1, 0, 0)]:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, 'M8[ns]')
assert dtype == 'M8[ns]'

for data in [np.timedelta64(1, 'ns'), Timedelta(1),
datetime.timedelta(1)]:
timedelta(1)]:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, 'm8[ns]')
assert dtype == 'm8[ns]'

for data in [datetime.date(2000, 1, 1),
for data in [date(2000, 1, 1),
Timestamp(1, tz='US/Eastern'), 'foo']:
dtype, val = infer_dtype_from_scalar(data)
self.assertEqual(dtype, np.object_)
assert dtype == np.object_

@pytest.mark.parametrize(
    "arr, expected",
    [
        ('foo', np.object_),
        (b'foo', np.object_),
        (1, np.int_),
        (1.5, np.float_),
        ([1], np.int_),
        (np.array([1]), np.int_),
        ([np.nan, 1, ''], np.object_),
        (np.array([[1.0, 2.0]]), np.float_),
        (Timestamp('20160101'), np.object_),
        (np.datetime64('2016-01-01'), np.dtype('<M8[D]')),
    ])
def test_infer_dtype_from_array(self, arr, expected):
    # Each case is expected to infer to the listed numpy dtype; only the
    # dtype half of the returned pair is checked.
    inferred, _values = infer_dtype_from_array(arr)
    assert inferred == expected


class TestMaybe(tm.TestCase):
Expand Down
44 changes: 44 additions & 0 deletions pandas/types/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,50 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
return dtype, val


def infer_dtype_from_array(arr):
    """
    Infer the dtype from a scalar or array-like.

    Parameters
    ----------
    arr : scalar or array-like

    Returns
    -------
    tuple (dtype, values)
        * ``np.ndarray`` input: its own dtype and the array, unchanged.
        * string / bytes / mixed input: ``np.object_`` and the values as
          passed (a scalar is wrapped in a one-element list); the values
          are NOT converted to an ndarray.
        * anything else: the dtype chosen by ``np.asarray`` together with
          the resulting ndarray.

    Notes
    -----
    These infer to numpy dtypes exactly, with the exception that
    mixed / object dtypes are not coerced by stringifying or conversion.

    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (numpy.object_, [1, '1'])
    """

    # an ndarray already carries its dtype; hand it back untouched
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    # wrap scalars so the inference below always sees a list-like
    if not is_list_like(arr):
        arr = [arr]

    # don't force numpy coerce with nan's: for string-like or mixed
    # content np.asarray would stringify the values, so report object
    # dtype and leave the values as they are
    inferred = lib.infer_dtype(arr)
    if inferred in ['string', 'bytes', 'unicode',
                    'mixed', 'mixed-integer']:
        return (np.object_, arr)

    arr = np.asarray(arr)
    return arr.dtype, arr


def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
""" provide explicit type promotion and coercion
Expand Down

0 comments on commit 6f789e1

Please sign in to comment.