diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py
index 4f72d8947765..126929d6d627 100644
--- a/py-polars/polars/_utils/various.py
+++ b/py-polars/polars/_utils/various.py
@@ -5,6 +5,7 @@
 import re
 import sys
 import warnings
+from collections import Counter
 from collections.abc import (
     Collection,
     Generator,
@@ -42,7 +43,7 @@ from polars.dependencies import numpy as np

 if TYPE_CHECKING:
-    from collections.abc import Iterator, Reversible
+    from collections.abc import Iterator, MutableMapping, Reversible

     from polars import DataFrame, Expr
     from polars._typing import PolarsDataType, SizeUnit

@@ -247,6 +248,16 @@ def ordered_unique(values: Sequence[Any]) -> list[Any]:
     return [v for v in values if not (v in seen or add_(v))]


+def deduplicate_names(names: Iterable[str]) -> list[str]:
+    """Ensure name uniqueness by appending a counter to subsequent duplicates."""
+    seen: MutableMapping[str, int] = Counter()
+    deduped = []
+    for nm in names:
+        deduped.append(f"{nm}{seen[nm] - 1}" if nm in seen else nm)
+        seen[nm] += 1
+    return deduped
+
+
 @overload
 def scale_bytes(sz: int, unit: SizeUnit) -> int | float: ...

diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
index 700d3e5cea13..1d32589302bc 100644
--- a/py-polars/polars/io/spreadsheet/functions.py
+++ b/py-polars/polars/io/spreadsheet/functions.py
@@ -1,12 +1,12 @@
 from __future__ import annotations

+import os
 import re
 import warnings
 from collections.abc import Sequence
 from datetime import time
 from glob import glob
 from io import BufferedReader, BytesIO, StringIO, TextIOWrapper
-from os import PathLike
 from pathlib import Path
 from typing import IO, TYPE_CHECKING, Any, Callable, NoReturn, overload

@@ -17,7 +17,7 @@
     deprecate_renamed_parameter,
     issue_deprecation_warning,
 )
-from polars._utils.various import normalize_filepath, parse_version
+from polars._utils.various import deduplicate_names, normalize_filepath, parse_version
 from polars.datatypes import (
     N_INFER_DEFAULT,
     Boolean,
@@ -57,14 +57,20 @@ def _sources(
         source = [source]  # type: ignore[assignment]

     for src in source:  # type: ignore[union-attr]
-        if isinstance(src, (str, PathLike)) and not Path(src).exists():
-            sources.extend(glob(str(src), recursive=True))  # noqa: PTH207
+        if isinstance(src, (str, os.PathLike)) and not Path(src).exists():
+            src = os.path.expanduser(str(src))  # noqa: PTH111
+            sources.extend(glob(src, recursive=True))  # noqa: PTH207
         else:
             sources.append(src)
     return sources, read_multiple_workbooks


+def _standardize_duplicates(s: str) -> str:
+    """Standardize columns with '_duplicated_n' names."""
+    return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s)
+
+
 @overload
 def read_excel(
     source: FileSource,
@@ -79,6 +85,7 @@ def read_excel(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> pl.DataFrame: ...

@@ -97,6 +104,7 @@ def read_excel(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> pl.DataFrame: ...

@@ -115,6 +123,7 @@ def read_excel(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> NoReturn: ...
@@ -135,6 +144,7 @@ def read_excel(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> dict[str, pl.DataFrame]: ...

@@ -153,6 +163,7 @@ def read_excel(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> pl.DataFrame: ...

@@ -171,6 +182,7 @@ def read_excel(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> dict[str, pl.DataFrame]: ...

@@ -190,6 +202,7 @@ def read_excel(
     schema_overrides: SchemaDict | None = None,
     infer_schema_length: int | None = N_INFER_DEFAULT,
     drop_empty_rows: bool = True,
+    drop_empty_cols: bool = True,
     raise_if_empty: bool = True,
 ) -> pl.DataFrame | dict[str, pl.DataFrame]:
     """
@@ -262,6 +275,10 @@ def read_excel(
         this parameter.
     drop_empty_rows
         Indicate whether to omit empty rows when reading data into the DataFrame.
+    drop_empty_cols
+        Indicate whether to omit empty columns (with no headers) when reading data into
+        the DataFrame (note that empty column identification may vary depending on the
+        underlying engine being used).
     raise_if_empty
         When there is no data in the sheet, `NoDataError` is raised. If this parameter
         is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -335,6 +352,7 @@ def read_excel(
             has_header=has_header,
             columns=columns,
             drop_empty_rows=drop_empty_rows,
+            drop_empty_cols=drop_empty_cols,
             read_multiple_workbooks=read_multiple_workbooks,
         )
         for src in sources
@@ -355,6 +373,7 @@ def read_ods(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> pl.DataFrame: ...

@@ -370,6 +389,7 @@ def read_ods(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> pl.DataFrame: ...

@@ -385,6 +405,7 @@ def read_ods(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> NoReturn: ...

@@ -400,6 +421,7 @@ def read_ods(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> dict[str, pl.DataFrame]: ...

@@ -415,6 +437,7 @@ def read_ods(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> pl.DataFrame: ...

@@ -430,6 +453,7 @@ def read_ods(
     schema_overrides: SchemaDict | None = ...,
     infer_schema_length: int | None = ...,
     drop_empty_rows: bool = ...,
+    drop_empty_cols: bool = ...,
     raise_if_empty: bool = ...,
 ) -> dict[str, pl.DataFrame]: ...

@@ -444,6 +468,7 @@ def read_ods(
     schema_overrides: SchemaDict | None = None,
     infer_schema_length: int | None = N_INFER_DEFAULT,
     drop_empty_rows: bool = True,
+    drop_empty_cols: bool = True,
     raise_if_empty: bool = True,
 ) -> pl.DataFrame | dict[str, pl.DataFrame]:
     """
@@ -479,6 +504,10 @@ def read_ods(
         large workbooks.
     drop_empty_rows
         Indicate whether to omit empty rows when reading data into the DataFrame.
+    drop_empty_cols
+        Indicate whether to omit empty columns (with no headers) when reading data into
+        the DataFrame (note that empty column identification may vary depending on the
+        underlying engine being used).
     raise_if_empty
         When there is no data in the sheet, `NoDataError` is raised. If this parameter
         is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -523,6 +552,7 @@ def read_ods(
         infer_schema_length=infer_schema_length,
         raise_if_empty=raise_if_empty,
         drop_empty_rows=drop_empty_rows,
+        drop_empty_cols=drop_empty_cols,
         has_header=has_header,
         columns=columns,
         read_multiple_workbooks=read_multiple_workbooks,
@@ -548,6 +578,7 @@ def _read_spreadsheet(
     has_header: bool = True,
     raise_if_empty: bool = True,
     drop_empty_rows: bool = True,
+    drop_empty_cols: bool = True,
     read_multiple_workbooks: bool = False,
 ) -> pl.DataFrame | dict[str, pl.DataFrame]:
     if isinstance(source, (str, Path)):
@@ -587,6 +618,7 @@ def _read_spreadsheet(
             raise_if_empty=raise_if_empty,
             columns=columns,
             drop_empty_rows=drop_empty_rows,
+            drop_empty_cols=drop_empty_cols,
         )
         for name in sheet_names
     }
@@ -774,8 +806,9 @@ def _csv_buffer_to_frame(
     separator: str,
     read_options: dict[str, Any],
     schema_overrides: SchemaDict | None,
-    raise_if_empty: bool,
     drop_empty_rows: bool,
+    drop_empty_cols: bool,
+    raise_if_empty: bool,
 ) -> pl.DataFrame:
     """Translate StringIO buffer containing delimited data as a DataFrame."""
     # handle (completely) empty sheet data
@@ -810,35 +843,39 @@ def _csv_buffer_to_frame(
         **read_options,
     )
     return _drop_null_data(
-        df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
+        df,
+        raise_if_empty=raise_if_empty,
+        drop_empty_rows=drop_empty_rows,
+        drop_empty_cols=drop_empty_cols,
     )


 def _drop_null_data(
-    df: pl.DataFrame, *, raise_if_empty: bool, drop_empty_rows: bool = True
+    df: pl.DataFrame,
+    *,
+    raise_if_empty: bool,
+    drop_empty_rows: bool = True,
+    drop_empty_cols: bool = True,
 ) -> pl.DataFrame:
-    """
-    If DataFrame contains columns/rows that contain only nulls, drop them.
-
-    If `drop_empty_rows` is set to `False`, empty rows are not dropped.
- """ + """If DataFrame contains columns/rows that contain only nulls, drop them.""" null_cols: list[str] = [] - for col_name in df.columns: - # note that if multiple unnamed columns are found then all but the first one - # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine) - if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name): - col = df[col_name] - if ( - col.dtype == Null - or col.null_count() == len(df) - or ( - col.dtype in NUMERIC_DTYPES - and col.replace(0, None).null_count() == len(df) - ) - ): - null_cols.append(col_name) - if null_cols: - df = df.drop(*null_cols) + if drop_empty_cols: + for col_name in df.columns: + # note that if multiple unnamed columns are found then all but the first one + # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine) + if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name): + col = df[col_name] + if ( + col.dtype == Null + or col.null_count() == len(df) + or ( + col.dtype in NUMERIC_DTYPES + and col.replace(0, None).null_count() == len(df) + ) + ): + null_cols.append(col_name) + if null_cols: + df = df.drop(*null_cols) if len(df) == 0 and len(df.columns) == 0: return _empty_frame(raise_if_empty) @@ -875,8 +912,9 @@ def _read_spreadsheet_openpyxl( read_options: dict[str, Any], schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, - raise_if_empty: bool, drop_empty_rows: bool, + drop_empty_cols: bool, + raise_if_empty: bool, ) -> pl.DataFrame: """Use the 'openpyxl' library to read data from the given worksheet.""" infer_schema_length = read_options.pop("infer_schema_length", None) @@ -916,9 +954,9 @@ def _read_spreadsheet_openpyxl( dtype = String if no_inference else None series_data = [] for name, column_data in zip(header, zip(*rows_iter)): - if name: + if name or not drop_empty_cols: values = [cell.value for cell in column_data] - if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment] + if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment,arg-type] # note: if we initialise the series with mixed-type data (eg: str/int) # then the non-strings will become null, so we handle the cast here values = [str(v) if (v is not None) else v for v in values] @@ -926,15 +964,18 @@ def _read_spreadsheet_openpyxl( s = pl.Series(name, values, dtype=dtype, strict=False) series_data.append(s) + names = deduplicate_names(s.name for s in series_data) df = pl.DataFrame( - {s.name: s for s in series_data}, + dict(zip(names, series_data)), schema_overrides=schema_overrides, infer_schema_length=infer_schema_length, strict=False, ) - df = _drop_null_data( - df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows + df, + raise_if_empty=raise_if_empty, + drop_empty_rows=drop_empty_rows, + drop_empty_cols=drop_empty_cols, ) df = _reorder_columns(df, columns) return df @@ -947,8 +988,9 @@ def _read_spreadsheet_calamine( read_options: dict[str, Any], schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, - raise_if_empty: bool, drop_empty_rows: bool, + drop_empty_cols: bool, + raise_if_empty: bool, ) -> pl.DataFrame: # if we have 'schema_overrides' and a more recent version of `fastexcel` # we can pass translated dtypes to the engine to refine the initial parse @@ -1002,7 +1044,10 @@ def _read_spreadsheet_calamine( df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)] df = _drop_null_data( - df, 
+        df,
+        raise_if_empty=raise_if_empty,
+        drop_empty_rows=drop_empty_rows,
+        drop_empty_cols=drop_empty_cols,
     )

     # note: even if we applied parser dtypes we still re-apply schema_overrides
@@ -1050,8 +1095,9 @@ def _read_spreadsheet_xlsx2csv(
     read_options: dict[str, Any],
     schema_overrides: SchemaDict | None,
     columns: Sequence[int] | Sequence[str] | None,
-    raise_if_empty: bool,
     drop_empty_rows: bool,
+    drop_empty_cols: bool,
+    raise_if_empty: bool,
 ) -> pl.DataFrame:
     """Use the 'xlsx2csv' library to read data from the given worksheet."""
     csv_buffer = StringIO()
@@ -1080,8 +1126,10 @@ def _read_spreadsheet_xlsx2csv(
         schema_overrides=schema_overrides,
         raise_if_empty=raise_if_empty,
         drop_empty_rows=drop_empty_rows,
+        drop_empty_cols=drop_empty_cols,
     )
     if cast_to_boolean:
         df = df.with_columns(*cast_to_boolean)
+    df = df.rename(_standardize_duplicates)
     return _reorder_columns(df, columns)
diff --git a/py-polars/tests/unit/io/files/example.xlsx b/py-polars/tests/unit/io/files/example.xlsx
index 13e80e618fa2..0dc2081a9249 100644
Binary files a/py-polars/tests/unit/io/files/example.xlsx and b/py-polars/tests/unit/io/files/example.xlsx differ
diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py
index 2bfc830c4712..433d652b07ec 100644
--- a/py-polars/tests/unit/io/test_spreadsheet.py
+++ b/py-polars/tests/unit/io/test_spreadsheet.py
@@ -309,6 +309,49 @@ def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None:
     )


+@pytest.mark.parametrize(
+    ("read_spreadsheet", "source", "params"),
+    [
+        # TODO: uncomment once fastexcel offers a suitable param
+        # (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
+        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
+        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
+    ],
+)
+def test_read_dropped_cols(
+    read_spreadsheet: Callable[..., dict[str, pl.DataFrame]],
+    source: str,
+    params: dict[str, str],
+    request: pytest.FixtureRequest,
+) -> None:
+    spreadsheet_path = request.getfixturevalue(source)
+
+    df1 = read_spreadsheet(
+        spreadsheet_path,
+        sheet_name="test4",
+        **params,
+    )
+    df2 = read_spreadsheet(
+        spreadsheet_path,
+        sheet_name="test4",
+        drop_empty_cols=False,
+        **params,
+    )
+    assert df1.to_dict(as_series=False) == {  # type: ignore[attr-defined]
+        "cardinality": [1, 3, 15, 30, 150, 300],
+        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
+        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
+    }
+    assert df2.to_dict(as_series=False) == {  # type: ignore[attr-defined]
+        "": [None, None, None, None, None, None],
+        "cardinality": [1, 3, 15, 30, 150, 300],
+        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
+        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
+        "0": [None, None, None, None, None, None],
+        "1": [None, None, None, None, None, None],
+    }
+
+
 @pytest.mark.parametrize(
     ("read_spreadsheet", "source", "params"),
     [
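
For reference, a quick sketch of what the two new helpers produce. This is illustrative only: the dedup logic is copied from the `deduplicate_names` helper added in `_utils/various.py`, and the column names are made up.

    import re
    from collections import Counter

    # copy of the new deduplicate_names() helper, untyped for brevity
    def deduplicate_names(names):
        seen = Counter()
        deduped = []
        for nm in names:
            # first occurrence keeps its name; later ones get a 0-based suffix
            deduped.append(f"{nm}{seen[nm] - 1}" if nm in seen else nm)
            seen[nm] += 1
        return deduped

    print(deduplicate_names(["a", "a", "b", "a"]))  # -> ['a', 'a0', 'b', 'a1']

    # per the comment in _drop_null_data, the CSV-based path names later
    # duplicates "{name}_duplicated_{n}"; _standardize_duplicates() strips
    # that back to the bare counter so the suffix style matches the above
    print(re.sub(r"_duplicated_(\d+)", repl=r"\1", string="a_duplicated_0"))  # -> 'a0'

Together with the `df.rename(_standardize_duplicates)` call in the xlsx2csv path, this keeps column names for repeated headers consistent across engines.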
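And a minimal usage sketch for the new keyword itself. The workbook path is hypothetical; the sheet name and the "", "0", "1" column names mirror the test fixture above.

    import polars as pl

    # default drop_empty_cols=True: header-less all-null columns are dropped, as before
    df = pl.read_excel("example.xlsx", sheet_name="test4", engine="openpyxl")

    # opt out: keep the empty unnamed columns ("", "0", "1" after deduplication)
    df_all = pl.read_excel(
        "example.xlsx",
        sheet_name="test4",
        engine="openpyxl",
        drop_empty_cols=False,
    )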