Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TypeError: data type not understood #34

Open
mattharrison opened this issue Sep 3, 2021 · 2 comments
Open

TypeError: data type not understood #34

mattharrison opened this issue Sep 3, 2021 · 2 comments

Comments

@mattharrison
Copy link

Brief Description

I'm trying to run pandas-log on my pandas method chain, and it fails with the following error:

TypeError: data type not understood

System Information

  • Python version (required): Python 3.8.5
  • Pandas version: 1.3.2

Minimally Reproducible Code

import pandas as pd
autos = pd.read_csv('https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip')
def to_tz(df_, time_col, tz_offset, tz_name):
    return (df_
             .groupby(tz_offset)
             [time_col]
             .transform(lambda s: pd.to_datetime(s)
                 .dt.tz_localize(s.name, ambiguous=True)
                 .dt.tz_convert(tz_name))
            )


def tweak_autos(autos):
    cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr', 
        'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
    return (autos
     [cols]
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
             displ=autos.displ.fillna(0).astype('float16'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.str.contains('Auto'),
             speeds=autos.trany.str.extract(r'(\d)+').fillna('20').astype('int8'),
             tz=autos.createdOn.str.extract(r'\d\d:\d\d ([A-Z]{3}?)').replace('EDT', 'EST5EDT'),
             str_date=(autos.createdOn.str.slice(4,19) + ' ' + autos.createdOn.str.slice(-4)),
             createdOn=lambda df_: to_tz(df_, 'str_date', 'tz', 'US/Eastern'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     .pipe(show, rows=2, title='New Cols')            
     .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',
              'range': 'int16',  'year': 'int16', 'make': 'category'})
     .drop(columns=['trany', 'eng_dscr'])
    )
import pandas_log
with pandas_log.enable():
    tweak_autos(autos)

Error Messages

1) fillna(value: 'object | ArrayLike | None' ="20", method: 'FillnaOptions | None' = None, axis: 'Axis | None' = None, inplace: 'bool' = False, limit=None, downcast=None):
	Metadata:
	* Filled 837 with 20.
	Execution Stats:
	* Execution time: Step Took 0.001512 seconds.

1) replace(to_replace="EDT", value="EST5EDT", inplace: 'bool' = False, limit=None, regex: 'bool' = False, method: 'str' = 'pad'):
	Execution Stats:
	* Execution time: Step Took 0.001215 seconds.

1) groupby(by="tz", axis: 'Axis' = 0, level: 'Level | None' = None, as_index: 'bool' = True, sort: 'bool' = True, group_keys: 'bool' = True, squeeze: 'bool | lib.NoDefault' = <no_default>, observed: 'bool' = False, dropna: 'bool' = True):
	Metadata:
	* Grouping by tz resulted in 2 groups like 
		EST,
		EST5EDT,
	  and more.
	Execution Stats:
	* Execution time: Step Took 0.006409 seconds.
/home/matt/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py:249: UserWarning: Some pandas logging may involve copying dataframes, which can be time-/memory-intensive. Consider passing copy_ok=False to the enable/auto_enable functions in pandas_log if issues arise.
  warnings.warn(COPY_WARNING_MSG)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-f6bfc55c635b> in <module>
     33 import pandas_log
     34 with pandas_log.enable():
---> 35     tweak_autos(autos)

<ipython-input-1-f6bfc55c635b> in tweak_autos(autos)
     14     cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr', 
     15         'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
---> 16     return (autos
     17      [cols]
     18      .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),

~/envs/menv/lib/python3.8/site-packages/pandas_flavor/register.py in __call__(self, *args, **kwargs)
     27             @wraps(method)
     28             def __call__(self, *args, **kwargs):
---> 29                 return method(self._obj, *args, **kwargs)
     30 
     31         register_dataframe_accessor(method.__name__)(AccessorMethod)

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in wrapped(*args, **fn_kwargs)
    184 
    185             input_df, fn_args = args[0], args[1:]
--> 186             output_df = _run_method_and_calc_stats(
    187                 fn,
    188                 fn_args,

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in _run_method_and_calc_stats(fn, fn_args, fn_kwargs, input_df, full_signature, silent, verbose, copy_ok, calculate_memory)
    168             output_df,
    169         )
--> 170         step_stats.log_stats_if_needed(silent, verbose, copy_ok)
    171         if isinstance(output_df, pd.DataFrame) or isinstance(output_df, pd.Series):
    172             step_stats.persist_execution_stats()

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in log_stats_if_needed(self, silent, verbose, copy_ok)
    106 
    107         if verbose or self.fn.__name__ not in DATAFRAME_ADDITIONAL_METHODS_TO_OVERIDE:
--> 108             s = self.__repr__(verbose, copy_ok)
    109             if s:
    110                 # If this method isn't patched and verbose is False, __repr__ will give an empty string, which

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in __repr__(self, verbose, copy_ok)
    147 
    148         # Step Metadata stats
--> 149         logs, tips = self.get_logs_for_specifc_method(verbose, copy_ok)
    150         metadata_stats = f"\033[4mMetadata\033[0m:\n{logs}" if logs else ""
    151         metadata_tips = f"\033[4mTips\033[0m:\n{tips}" if tips else ""

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in get_logs_for_specifc_method(self, verbose, copy_ok)
    128 
    129         log_method = partial(log_method, self.output_df, self.input_df)
--> 130         logs, tips = log_method(*self.fn_args, **self.fn_kwargs)
    131         return logs, tips
    132 

~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in log_assign(output_df, input_df, **kwargs)
    250             # If copying is ok, we can check how many values actually changed
    251             for col in changed_cols:
--> 252                 values_changed, values_unchanged = num_values_changed(
    253                     input_df[col], output_df[col]
    254                 )

~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in num_values_changed(input_obj, output_obj)
    127         isinstance(input_obj, pd.Series)
    128         and isinstance(output_obj, pd.Series)
--> 129         and input_obj.dtype != output_obj.dtype
    130     ):
    131         # Comparing values for equality across dtypes wouldn't be well-defined so we just say they all changed

TypeError: Cannot interpret 'datetime64[ns, US/Eastern]' as a data type
@mattharrison
Copy link
Author

Note that I commented out the line:

createdOn=lambda df_: to_tz(df_, 'str_date', 'tz', 'US/Eastern'),

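Then I re-ran the code and got the same kind of error, this time raised for the categorical `drive` column instead of the timezone-aware `createdOn` column: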

/home/matt/envs/menv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3418: DtypeWarning: Columns (68,70,71,72,73,74,76,79) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)

1) fillna(value: 'object | ArrayLike | None' ="20", method: 'FillnaOptions | None' = None, axis: 'Axis | None' = None, inplace: 'bool' = False, limit=None, downcast=None):
	Metadata:
	* Filled 837 with 20.
	Execution Stats:
	* Execution time: Step Took 0.001567 seconds.

1) replace(to_replace="EDT", value="EST5EDT", inplace: 'bool' = False, limit=None, regex: 'bool' = False, method: 'str' = 'pad'):
	Execution Stats:
	* Execution time: Step Took 0.003579 seconds.
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-4-376a7e1a7d6b> in <module>
     33 import pandas_log
     34 with pandas_log.enable():
---> 35     tweak_autos(autos)

<ipython-input-4-376a7e1a7d6b> in tweak_autos(autos)
     14     cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr', 
     15         'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
---> 16     return (autos
     17      [cols]
     18      .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),

~/envs/menv/lib/python3.8/site-packages/pandas_flavor/register.py in __call__(self, *args, **kwargs)
     27             @wraps(method)
     28             def __call__(self, *args, **kwargs):
---> 29                 return method(self._obj, *args, **kwargs)
     30 
     31         register_dataframe_accessor(method.__name__)(AccessorMethod)

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in wrapped(*args, **fn_kwargs)
    184 
    185             input_df, fn_args = args[0], args[1:]
--> 186             output_df = _run_method_and_calc_stats(
    187                 fn,
    188                 fn_args,

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in _run_method_and_calc_stats(fn, fn_args, fn_kwargs, input_df, full_signature, silent, verbose, copy_ok, calculate_memory)
    168             output_df,
    169         )
--> 170         step_stats.log_stats_if_needed(silent, verbose, copy_ok)
    171         if isinstance(output_df, pd.DataFrame) or isinstance(output_df, pd.Series):
    172             step_stats.persist_execution_stats()

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in log_stats_if_needed(self, silent, verbose, copy_ok)
    106 
    107         if verbose or self.fn.__name__ not in DATAFRAME_ADDITIONAL_METHODS_TO_OVERIDE:
--> 108             s = self.__repr__(verbose, copy_ok)
    109             if s:
    110                 # If this method isn't patched and verbose is False, __repr__ will give an empty string, which

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in __repr__(self, verbose, copy_ok)
    147 
    148         # Step Metadata stats
--> 149         logs, tips = self.get_logs_for_specifc_method(verbose, copy_ok)
    150         metadata_stats = f"\033[4mMetadata\033[0m:\n{logs}" if logs else ""
    151         metadata_tips = f"\033[4mTips\033[0m:\n{tips}" if tips else ""

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in get_logs_for_specifc_method(self, verbose, copy_ok)
    128 
    129         log_method = partial(log_method, self.output_df, self.input_df)
--> 130         logs, tips = log_method(*self.fn_args, **self.fn_kwargs)
    131         return logs, tips
    132 

~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in log_assign(output_df, input_df, **kwargs)
    250             # If copying is ok, we can check how many values actually changed
    251             for col in changed_cols:
--> 252                 values_changed, values_unchanged = num_values_changed(
    253                     input_df[col], output_df[col]
    254                 )

~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in num_values_changed(input_obj, output_obj)
    127         isinstance(input_obj, pd.Series)
    128         and isinstance(output_obj, pd.Series)
--> 129         and input_obj.dtype != output_obj.dtype
    130     ):
    131         # Comparing values for equality across dtypes wouldn't be well-defined so we just say they all changed

TypeError: Cannot interpret 'CategoricalDtype(categories=['2-Wheel Drive', '4-Wheel Drive',
                  '4-Wheel or All-Wheel Drive', 'All-Wheel Drive',
                  'Front-Wheel Drive', 'Other', 'Part-time 4-Wheel Drive',
                  'Rear-Wheel Drive'],
, ordered=False)' as a data type

@mattharrison
Copy link
Author

Also note that these failures were not handled correctly by the context manager: after the exception is raised inside the `with pandas_log.enable():` block, running tweak_autos normally (outside the block) still tries to use pandas-log and fails. Maybe this warrants its own bug.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant