Brief Description
I'm trying to run pandas-log on my pandas method chain, and it fails with the error:
TypeError: data type not understood
System Information
- Python version (required): Python 3.8.5
- Pandas version: 1.3.2
Minimally Reproducible Code
import pandas as pd
# Load the zipped vehicles CSV directly from GitHub (requires network access).
autos = pd.read_csv('https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip')
def to_tz(df_, time_col, tz_offset, tz_name):
    """Parse a column of date strings into timestamps in one time zone.

    Rows are grouped by the zone label stored in ``tz_offset`` so that each
    group can be localized to its own zone before every value is converted
    to ``tz_name``.

    Args:
        df_: DataFrame containing both ``time_col`` and ``tz_offset``.
        time_col: Name of the column holding the date strings.
        tz_offset: Name of the column holding each row's source zone label.
        tz_name: Zone that every timestamp is converted to.

    Returns:
        The output of the grouped ``transform`` on ``time_col``: the parsed
        timestamps converted to ``tz_name``.
    """
    return (
        df_
        .groupby(tz_offset)
        [time_col]
        # ``s.name`` is passed as the zone to localize to; this relies on
        # pandas naming each group's Series after its group key.
        # ``ambiguous=True`` treats ambiguous wall-clock times as DST.
        .transform(
            lambda s: pd.to_datetime(s)
            .dt.tz_localize(s.name, ambiguous=True)
            .dt.tz_convert(tz_name)
        )
    )
def tweak_autos(autos):
    """Select, clean, and downcast a subset of the vehicles columns.

    Keeps the columns listed in ``cols``, adds or overwrites several derived
    columns via ``assign``, pipes the intermediate frame through ``show``,
    casts a set of columns to smaller dtypes, and finally drops ``trany``
    and ``eng_dscr``.

    NOTE: ``show`` is not defined anywhere in this snippet; it has to be
    provided by the surrounding session for the chain to get past ``pipe``.

    Args:
        autos: The raw vehicles DataFrame.

    Returns:
        The transformed DataFrame.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr',
            'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
    return (
        autos
        [cols]
        .assign(
            cylinders=autos.cylinders.fillna(0).astype('int8'),
            displ=autos.displ.fillna(0).astype('float16'),
            drive=autos.drive.fillna('Other').astype('category'),
            automatic=autos.trany.str.contains('Auto'),
            speeds=autos.trany.str.extract(r'(\d)+').fillna('20').astype('int8'),
            # Capture the zone abbreviation that follows the HH:MM time and
            # replace 'EDT' with the 'EST5EDT' zone name.
            tz=autos.createdOn.str.extract(r'\d\d:\d\d ([A-Z]{3}?)').replace('EDT', 'EST5EDT'),
            # Rebuild the date string from two slices of createdOn.
            str_date=(autos.createdOn.str.slice(4, 19) + ' ' + autos.createdOn.str.slice(-4)),
            # A callable is used here because it needs the str_date and tz
            # columns created earlier in this same assign call.
            createdOn=lambda df_: to_tz(df_, 'str_date', 'tz', 'US/Eastern'),
            ffs=autos.eng_dscr.str.contains('FFS'),
        )
        .pipe(show, rows=2, title='New Cols')
        .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',
                 'range': 'int16', 'year': 'int16', 'make': 'category'})
        .drop(columns=['trany', 'eng_dscr'])
    )
import pandas_log

# Run the chain with pandas-log enabled; the TypeError in the traceback below
# is raised from inside this call.
with pandas_log.enable():
    tweak_autos(autos)
Error Messages
1) fillna(value: 'object | ArrayLike | None' ="20", method: 'FillnaOptions | None' = None, axis: 'Axis | None' = None, inplace: 'bool' = False, limit=None, downcast=None):
Metadata:
* Filled 837 with 20.
Execution Stats:
* Execution time: Step Took 0.001512 seconds.
1) replace(to_replace="EDT", value="EST5EDT", inplace: 'bool' = False, limit=None, regex: 'bool' = False, method: 'str' = 'pad'):
Execution Stats:
* Execution time: Step Took 0.001215 seconds.
1) groupby(by="tz", axis: 'Axis' = 0, level: 'Level | None' = None, as_index: 'bool' = True, sort: 'bool' = True, group_keys: 'bool' = True, squeeze: 'bool | lib.NoDefault' = <no_default>, observed: 'bool' = False, dropna: 'bool' = True):
Metadata:
* Grouping by tz resulted in 2 groups like
EST,
EST5EDT,
and more.
Execution Stats:
* Execution time: Step Took 0.006409 seconds.
/home/matt/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py:249: UserWarning: Some pandas logging may involve copying dataframes, which can be time-/memory-intensive. Consider passing copy_ok=False to the enable/auto_enable functions in pandas_log if issues arise.
warnings.warn(COPY_WARNING_MSG)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1-f6bfc55c635b> in <module>
33 import pandas_log
34 with pandas_log.enable():
---> 35 tweak_autos(autos)
<ipython-input-1-f6bfc55c635b> in tweak_autos(autos)
14 cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr',
15 'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
---> 16 return (autos
17 [cols]
18 .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
~/envs/menv/lib/python3.8/site-packages/pandas_flavor/register.py in __call__(self, *args, **kwargs)
27 @wraps(method)
28 def __call__(self, *args, **kwargs):
---> 29 return method(self._obj, *args, **kwargs)
30
31 register_dataframe_accessor(method.__name__)(AccessorMethod)
~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in wrapped(*args, **fn_kwargs)
184
185 input_df, fn_args = args[0], args[1:]
--> 186 output_df = _run_method_and_calc_stats(
187 fn,
188 fn_args,
~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in _run_method_and_calc_stats(fn, fn_args, fn_kwargs, input_df, full_signature, silent, verbose, copy_ok, calculate_memory)
168 output_df,
169 )
--> 170 step_stats.log_stats_if_needed(silent, verbose, copy_ok)
171 if isinstance(output_df, pd.DataFrame) or isinstance(output_df, pd.Series):
172 step_stats.persist_execution_stats()
~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in log_stats_if_needed(self, silent, verbose, copy_ok)
106
107 if verbose or self.fn.__name__ not in DATAFRAME_ADDITIONAL_METHODS_TO_OVERIDE:
--> 108 s = self.__repr__(verbose, copy_ok)
109 if s:
110 # If this method isn't patched and verbose is False, __repr__ will give an empty string, which
~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in __repr__(self, verbose, copy_ok)
147
148 # Step Metadata stats
--> 149 logs, tips = self.get_logs_for_specifc_method(verbose, copy_ok)
150 metadata_stats = f"\033[4mMetadata\033[0m:\n{logs}" if logs else ""
151 metadata_tips = f"\033[4mTips\033[0m:\n{tips}" if tips else ""
~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in get_logs_for_specifc_method(self, verbose, copy_ok)
128
129 log_method = partial(log_method, self.output_df, self.input_df)
--> 130 logs, tips = log_method(*self.fn_args, **self.fn_kwargs)
131 return logs, tips
132
~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in log_assign(output_df, input_df, **kwargs)
250 # If copying is ok, we can check how many values actually changed
251 for col in changed_cols:
--> 252 values_changed, values_unchanged = num_values_changed(
253 input_df[col], output_df[col]
254 )
~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in num_values_changed(input_obj, output_obj)
127 isinstance(input_obj, pd.Series)
128 and isinstance(output_obj, pd.Series)
--> 129 and input_obj.dtype != output_obj.dtype
130 ):
131 # Comparing values for equality across dtypes wouldn't be well-defined so we just say they all changed
TypeError: Cannot interpret 'datetime64[ns, US/Eastern]' as a data type