diff --git a/google/colab/_dataframe_summarizer.py b/google/colab/_dataframe_summarizer.py index b5fd7c69..74ffd346 100644 --- a/google/colab/_dataframe_summarizer.py +++ b/google/colab/_dataframe_summarizer.py @@ -36,14 +36,16 @@ def _check_type(dtype: str, value): def _summarize_columns(df: pd.DataFrame, n_samples: int = 3): """Summarize properties of each column in a pandas DataFrame.""" properties_list = [] - for column in df.columns: - dtype = df[column].dtype + # Include index if it is a named index. + df_with_columns = df.reset_index() if df.index.name else df + for column_name, column in df_with_columns.items(): + dtype = column.dtype properties = {} if dtype in (int, float, complex): properties["dtype"] = "number" - properties["std"] = _check_type(dtype, df[column].std()) - properties["min"] = _check_type(dtype, df[column].min()) - properties["max"] = _check_type(dtype, df[column].max()) + properties["std"] = _check_type(dtype, column.std()) + properties["min"] = _check_type(dtype, column.min()) + properties["max"] = _check_type(dtype, column.max()) elif dtype == bool: properties["dtype"] = "boolean" @@ -52,7 +54,7 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3): try: with warnings.catch_warnings(): warnings.simplefilter("ignore") - pd.to_datetime(df[column], errors="raise") + pd.to_datetime(column, errors="raise") if ( not column.empty and column.dtype.kind == "O" @@ -64,15 +66,15 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3): except (TypeError, ValueError): try: # Check if the string column has a limited number of values - if df[column].nunique() / len(df[column]) < 0.5: + if column.nunique() / len(column) < 0.5: properties["dtype"] = "category" else: properties["dtype"] = "string" except TypeError: properties["dtype"] = str(dtype) - elif pd.api.types.is_categorical_dtype(df[column]): + elif pd.api.types.is_categorical_dtype(column): properties["dtype"] = "category" - elif pd.api.types.is_datetime64_any_dtype(df[column]): + elif pd.api.types.is_datetime64_any_dtype(column): properties["dtype"] = "date" else: properties["dtype"] = str(dtype) @@ -80,21 +82,21 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3): # add min max if dtype is date if properties["dtype"] == "date": try: - properties["min"] = df[column].min() - properties["max"] = df[column].max() + properties["min"] = column.min() + properties["max"] = column.max() except TypeError: - cast_date_col = pd.to_datetime(df[column], errors="coerce") + cast_date_col = pd.to_datetime(column, errors="coerce") properties["min"] = cast_date_col.min() properties["max"] = cast_date_col.max() # Add additional properties to the output dictionary try: - nunique = df[column].nunique() + nunique = column.nunique() properties["num_unique_values"] = nunique except TypeError: pass if "samples" not in properties: try: - non_null_values = df[column][df[column].notnull()].unique() + non_null_values = column[column.notnull()].unique() n_samples = min(n_samples, len(non_null_values)) samples = ( pd.Series(non_null_values) @@ -107,6 +109,6 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3): pass properties["semantic_type"] = "" properties["description"] = "" - properties_list.append({"column": column, "properties": properties}) + properties_list.append({"column": column_name, "properties": properties}) return properties_list