Skip to content

Commit

Permalink
No public description
Browse files (browse the repository at this point in the history)
PiperOrigin-RevId: 613009142
  • Loading branch information
blois authored and colaboratory-team committed Mar 6, 2024
1 parent 7f20e6e commit 6985d4d
Showing 1 changed file with 17 additions and 15 deletions.
32 changes: 17 additions & 15 deletions google/colab/_dataframe_summarizer.py
Original file line number | Diff line number | Diff line change
@@ -36,14 +36,16 @@ def _check_type(dtype: str, value):
def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
"""Summarize properties of each column in a pandas DataFrame."""
properties_list = []
for column in df.columns:
dtype = df[column].dtype
# Include index if it is a named index.
df_with_columns = df.reset_index() if df.index.name else df
for column_name, column in df_with_columns.items():
dtype = column.dtype
properties = {}
if dtype in (int, float, complex):
properties["dtype"] = "number"
properties["std"] = _check_type(dtype, df[column].std())
properties["min"] = _check_type(dtype, df[column].min())
properties["max"] = _check_type(dtype, df[column].max())
properties["std"] = _check_type(dtype, column.std())
properties["min"] = _check_type(dtype, column.min())
properties["max"] = _check_type(dtype, column.max())

elif dtype == bool:
properties["dtype"] = "boolean"
@@ -52,7 +54,7 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
try:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
pd.to_datetime(df[column], errors="raise")
pd.to_datetime(column, errors="raise")
if (
not column.empty
and column.dtype.kind == "O"
@@ -64,37 +66,37 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
except (TypeError, ValueError):
try:
# Check if the string column has a limited number of values
if df[column].nunique() / len(df[column]) < 0.5:
if column.nunique() / len(column) < 0.5:
properties["dtype"] = "category"
else:
properties["dtype"] = "string"
except TypeError:
properties["dtype"] = str(dtype)
elif pd.api.types.is_categorical_dtype(df[column]):
elif pd.api.types.is_categorical_dtype(column):
properties["dtype"] = "category"
elif pd.api.types.is_datetime64_any_dtype(df[column]):
elif pd.api.types.is_datetime64_any_dtype(column):
properties["dtype"] = "date"
else:
properties["dtype"] = str(dtype)

# add min max if dtype is date
if properties["dtype"] == "date":
try:
properties["min"] = df[column].min()
properties["max"] = df[column].max()
properties["min"] = column.min()
properties["max"] = column.max()
except TypeError:
cast_date_col = pd.to_datetime(df[column], errors="coerce")
cast_date_col = pd.to_datetime(column, errors="coerce")
properties["min"] = cast_date_col.min()
properties["max"] = cast_date_col.max()
# Add additional properties to the output dictionary
try:
nunique = df[column].nunique()
nunique = column.nunique()
properties["num_unique_values"] = nunique
except TypeError:
pass
if "samples" not in properties:
try:
non_null_values = df[column][df[column].notnull()].unique()
non_null_values = column[column.notnull()].unique()
n_samples = min(n_samples, len(non_null_values))
samples = (
pd.Series(non_null_values)
@@ -107,6 +109,6 @@ def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
pass
properties["semantic_type"] = ""
properties["description"] = ""
properties_list.append({"column": column, "properties": properties})
properties_list.append({"column": column_name, "properties": properties})

return properties_list

0 comments on commit 6985d4d

Please sign in to comment.