No public description

PiperOrigin-RevId: 613009142
googlecolab · Mar 6, 2024 · 6985d4d
1 parent 7f20e6e
commit 6985d4d
Showing 1 changed file with 17 additions and 15 deletions.
diff --git a/google/colab/_dataframe_summarizer.py b/google/colab/_dataframe_summarizer.py
@@ -36,14 +36,16 @@ def _check_type(dtype: str, value):
 def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
   """Summarize properties of each column in a pandas DataFrame."""
   properties_list = []
-  for column in df.columns:
-    dtype = df[column].dtype
+  # Include index if it is a named index.
+  df_with_columns = df.reset_index() if df.index.name else df
+  for column_name, column in df_with_columns.items():
+    dtype = column.dtype
     properties = {}
     if dtype in (int, float, complex):
       properties["dtype"] = "number"
-      properties["std"] = _check_type(dtype, df[column].std())
-      properties["min"] = _check_type(dtype, df[column].min())
-      properties["max"] = _check_type(dtype, df[column].max())
+      properties["std"] = _check_type(dtype, column.std())
+      properties["min"] = _check_type(dtype, column.min())
+      properties["max"] = _check_type(dtype, column.max())
 
     elif dtype == bool:
       properties["dtype"] = "boolean"
@@ -52,7 +54,7 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
       try:
         with warnings.catch_warnings():
           warnings.simplefilter("ignore")
-          pd.to_datetime(df[column], errors="raise")
+          pd.to_datetime(column, errors="raise")
           if (
               not column.empty
               and column.dtype.kind == "O"
@@ -64,37 +66,37 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
       except (TypeError, ValueError):
         try:
           # Check if the string column has a limited number of values
-          if df[column].nunique() / len(df[column]) < 0.5:
+          if column.nunique() / len(column) < 0.5:
             properties["dtype"] = "category"
           else:
             properties["dtype"] = "string"
         except TypeError:
           properties["dtype"] = str(dtype)
-    elif pd.api.types.is_categorical_dtype(df[column]):
+    elif pd.api.types.is_categorical_dtype(column):
       properties["dtype"] = "category"
-    elif pd.api.types.is_datetime64_any_dtype(df[column]):
+    elif pd.api.types.is_datetime64_any_dtype(column):
       properties["dtype"] = "date"
     else:
       properties["dtype"] = str(dtype)
 
     # add min max if dtype is date
     if properties["dtype"] == "date":
       try:
-        properties["min"] = df[column].min()
-        properties["max"] = df[column].max()
+        properties["min"] = column.min()
+        properties["max"] = column.max()
       except TypeError:
-        cast_date_col = pd.to_datetime(df[column], errors="coerce")
+        cast_date_col = pd.to_datetime(column, errors="coerce")
         properties["min"] = cast_date_col.min()
         properties["max"] = cast_date_col.max()
     # Add additional properties to the output dictionary
     try:
-      nunique = df[column].nunique()
+      nunique = column.nunique()
       properties["num_unique_values"] = nunique
     except TypeError:
       pass
     if "samples" not in properties:
       try:
-        non_null_values = df[column][df[column].notnull()].unique()
+        non_null_values = column[column.notnull()].unique()
         n_samples = min(n_samples, len(non_null_values))
         samples = (
             pd.Series(non_null_values)
@@ -107,6 +109,6 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
         pass
     properties["semantic_type"] = ""
     properties["description"] = ""
-    properties_list.append({"column": column, "properties": properties})
+    properties_list.append({"column": column_name, "properties": properties})
 
   return properties_list