From 7d97781dff16a1213cf9ca62185008caba116971 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 7 Apr 2024 20:05:18 +0200 Subject: [PATCH 1/9] Fix util names --- py-polars/polars/_utils/various.py | 10 ------ py-polars/polars/io/_utils.py | 32 ++++++++++++------- py-polars/polars/io/csv/batched_reader.py | 5 ++- py-polars/polars/io/csv/functions.py | 16 +++++----- py-polars/polars/io/ipc/anonymous_scan.py | 4 +-- py-polars/polars/io/ipc/functions.py | 22 ++++++------- py-polars/polars/io/ndjson.py | 5 +-- py-polars/polars/io/parquet/anonymous_scan.py | 4 +-- py-polars/polars/io/parquet/functions.py | 24 +++++++------- py-polars/polars/io/spreadsheet/functions.py | 6 ++-- py-polars/tests/unit/io/cloud/test_utils.py | 4 +-- py-polars/tests/unit/utils/test_utils.py | 4 +-- 12 files changed, 68 insertions(+), 68 deletions(-) diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py index 5856168ed95c..292ef6d73fc2 100644 --- a/py-polars/polars/_utils/various.py +++ b/py-polars/polars/_utils/various.py @@ -158,16 +158,6 @@ def range_to_slice(rng: range) -> slice: return slice(rng.start, rng.stop, rng.step) -def _prepare_row_index_args( - row_index_name: str | None = None, - row_index_offset: int = 0, -) -> tuple[str, int] | None: - if row_index_name is not None: - return (row_index_name, row_index_offset) - else: - return None - - def _in_notebook() -> bool: try: from IPython import get_ipython diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index b5e249c6b807..da7efe21b562 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -38,15 +38,25 @@ def handle_projection_columns( return projection, new_columns -def _is_glob_pattern(file: str) -> bool: +def prepare_row_index_args( + row_index_name: str | None = None, + row_index_offset: int = 0, +) -> tuple[str, int] | None: + if row_index_name is not None: + return (row_index_name, row_index_offset) + else: + return None + + +def is_glob_pattern(file: str) -> bool: return any(char in file for char in ["*", "?", "["]) -def _is_supported_cloud(file: str) -> bool: +def is_supported_cloud(file: str) -> bool: return bool(re.match("^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?)://", file)) -def _is_local_file(file: str) -> bool: +def is_local_file(file: str) -> bool: try: next(glob.iglob(file, recursive=True)) # noqa: PTH207 except StopIteration: @@ -56,7 +66,7 @@ def _is_local_file(file: str) -> bool: @overload -def _prepare_file_arg( +def prepare_file_arg( file: str | Path | list[str] | IO[bytes] | bytes, encoding: str | None = ..., *, @@ -67,7 +77,7 @@ def _prepare_file_arg( @overload -def _prepare_file_arg( +def prepare_file_arg( file: str | Path | IO[str] | IO[bytes] | bytes, encoding: str | None = ..., *, @@ -78,7 +88,7 @@ def _prepare_file_arg( @overload -def _prepare_file_arg( +def prepare_file_arg( file: str | Path | list[str] | IO[str] | IO[bytes] | bytes, encoding: str | None = ..., *, @@ -88,7 +98,7 @@ def _prepare_file_arg( ) -> ContextManager[str | list[str] | BytesIO | list[BytesIO]]: ... -def _prepare_file_arg( +def prepare_file_arg( file: str | Path | list[str] | IO[str] | IO[bytes] | bytes, encoding: str | None = None, *, @@ -181,8 +191,8 @@ def managed_file(file: Any) -> Iterator[Any]: # make sure that this is before fsspec # as fsspec needs requests to be installed # to read from http - if _looks_like_url(file): - return _process_file_url(file, encoding_str) + if looks_like_url(file): + return process_file_url(file, encoding_str) if _FSSPEC_AVAILABLE: from fsspec.utils import infer_storage_options @@ -245,11 +255,11 @@ def _check_empty( return b -def _looks_like_url(path: str) -> bool: +def looks_like_url(path: str) -> bool: return re.match("^(ht|f)tps?://", path, re.IGNORECASE) is not None -def _process_file_url(path: str, encoding: str | None = None) -> BytesIO: +def process_file_url(path: str, encoding: str | None = None) -> BytesIO: from urllib.request import urlopen with urlopen(path) as f: diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 6f79d0e71b63..fa93ff3d5a57 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -4,13 +4,12 @@ from typing import TYPE_CHECKING, Sequence from polars._utils.various import ( - _prepare_row_index_args, _process_null_values, normalize_filepath, ) from polars._utils.wrap import wrap_df from polars.datatypes import N_INFER_DEFAULT, py_type_to_dtype -from polars.io._utils import handle_projection_columns +from polars.io._utils import handle_projection_columns, prepare_row_index_args from polars.io.csv._utils import _update_columns with contextlib.suppress(ImportError): # Module not available when building docs @@ -98,7 +97,7 @@ def __init__( missing_utf8_is_empty_string=missing_utf8_is_empty_string, try_parse_dates=try_parse_dates, skip_rows_after_header=skip_rows_after_header, - row_index=_prepare_row_index_args(row_index_name, row_index_offset), + row_index=prepare_row_index_args(row_index_name, row_index_offset), sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index b3410ac088c1..68ab428dd61e 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -8,7 +8,6 @@ import polars._reexport as pl from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import ( - _prepare_row_index_args, _process_null_values, is_str_sequence, normalize_filepath, @@ -17,9 +16,10 @@ from polars.datatypes import N_INFER_DEFAULT, String from polars.datatypes.convert import py_type_to_dtype from polars.io._utils import ( - _is_glob_pattern, - _prepare_file_arg, handle_projection_columns, + is_glob_pattern, + prepare_file_arg, + prepare_row_index_args, ) from polars.io.csv._utils import _check_arg_is_1byte, _update_columns from polars.io.csv.batched_reader import BatchedCsvReader @@ -269,7 +269,7 @@ def read_csv( # for pyarrow. include_columns = [f"f{column_idx}" for column_idx in projection] - with _prepare_file_arg( + with prepare_file_arg( source, encoding=None, use_pyarrow=True, @@ -403,7 +403,7 @@ def read_csv( for column_name, column_dtype in dtypes.items() } - with _prepare_file_arg( + with prepare_file_arg( source, encoding=encoding, use_pyarrow=False, @@ -502,7 +502,7 @@ def _read_csv_impl( if isinstance(columns, str): columns = [columns] - if isinstance(source, str) and _is_glob_pattern(source): + if isinstance(source, str) and is_glob_pattern(source): dtypes_dict = None if dtype_list is not None: dtypes_dict = dict(dtype_list) @@ -574,7 +574,7 @@ def _read_csv_impl( missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, - _prepare_row_index_args(row_index_name, row_index_offset), + prepare_row_index_args(row_index_name, row_index_offset), sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, @@ -1189,7 +1189,7 @@ def _scan_csv_impl( rechunk, skip_rows_after_header, encoding, - _prepare_row_index_args(row_index_name, row_index_offset), + prepare_row_index_args(row_index_name, row_index_offset), try_parse_dates, eol_char=eol_char, raise_if_empty=raise_if_empty, diff --git a/py-polars/polars/io/ipc/anonymous_scan.py b/py-polars/polars/io/ipc/anonymous_scan.py index 3b2fb5cb8c2f..e0648c4980d1 100644 --- a/py-polars/polars/io/ipc/anonymous_scan.py +++ b/py-polars/polars/io/ipc/anonymous_scan.py @@ -5,7 +5,7 @@ import polars._reexport as pl import polars.io.ipc -from polars.io._utils import _prepare_file_arg +from polars.io._utils import prepare_file_arg if TYPE_CHECKING: from polars import DataFrame, LazyFrame @@ -18,7 +18,7 @@ def _scan_ipc_fsspec( func = partial(_scan_ipc_impl, source, storage_options=storage_options) storage_options = storage_options or {} - with _prepare_file_arg(source, storage_options=storage_options) as data: + with prepare_file_arg(source, storage_options=storage_options) as data: schema = polars.io.ipc.read_ipc_schema(data) return pl.LazyFrame._scan_python_function(schema, func) diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 52d6b6a307c8..fde7be409ee1 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -7,17 +7,17 @@ import polars._reexport as pl from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import ( - _prepare_row_index_args, is_str_sequence, normalize_filepath, ) from polars._utils.wrap import wrap_df, wrap_ldf from polars.dependencies import _PYARROW_AVAILABLE from polars.io._utils import ( - _is_glob_pattern, - _is_local_file, - _prepare_file_arg, handle_projection_columns, + is_glob_pattern, + is_local_file, + prepare_file_arg, + prepare_row_index_args, ) from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec @@ -94,7 +94,7 @@ def read_ipc( msg = "`n_rows` cannot be used with `use_pyarrow=True` and `memory_map=False`" raise ValueError(msg) - with _prepare_file_arg( + with prepare_file_arg( source, use_pyarrow=use_pyarrow, storage_options=storage_options ) as data: if use_pyarrow: @@ -139,7 +139,7 @@ def _read_ipc_impl( if isinstance(columns, str): columns = [columns] - if isinstance(source, str) and _is_glob_pattern(source) and _is_local_file(source): + if isinstance(source, str) and is_glob_pattern(source) and is_local_file(source): scan = scan_ipc( source, n_rows=n_rows, @@ -166,7 +166,7 @@ def _read_ipc_impl( columns, projection, n_rows, - _prepare_row_index_args(row_index_name, row_index_offset), + prepare_row_index_args(row_index_name, row_index_offset), memory_map=memory_map, ) return wrap_df(pydf) @@ -221,7 +221,7 @@ def read_ipc_stream( ------- DataFrame """ - with _prepare_file_arg( + with prepare_file_arg( source, use_pyarrow=use_pyarrow, storage_options=storage_options ) as data: if use_pyarrow: @@ -273,7 +273,7 @@ def _read_ipc_stream_impl( columns, projection, n_rows, - _prepare_row_index_args(row_index_name, row_index_offset), + prepare_row_index_args(row_index_name, row_index_offset), rechunk, ) return wrap_df(pydf) @@ -358,7 +358,7 @@ def scan_ipc( source = None # type: ignore[assignment] # try fsspec scanner - if can_use_fsspec and not _is_local_file(source): # type: ignore[arg-type] + if can_use_fsspec and not is_local_file(source): # type: ignore[arg-type] scan = _scan_ipc_fsspec(source, storage_options) # type: ignore[arg-type] if n_rows: scan = scan.head(n_rows) @@ -372,7 +372,7 @@ def scan_ipc( n_rows, cache, rechunk, - _prepare_row_index_args(row_index_name, row_index_offset), + prepare_row_index_args(row_index_name, row_index_offset), memory_map=memory_map, cloud_options=storage_options, retries=retries, diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 0e810a3f1eea..c1c077c66ea4 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -6,9 +6,10 @@ from typing import TYPE_CHECKING from polars._utils.deprecation import deprecate_renamed_parameter -from polars._utils.various import _prepare_row_index_args, normalize_filepath +from polars._utils.various import normalize_filepath from polars._utils.wrap import wrap_df, wrap_ldf from polars.datatypes import N_INFER_DEFAULT +from polars.io._utils import prepare_row_index_args with contextlib.suppress(ImportError): # Module not available when building docs from polars.polars import PyDataFrame, PyLazyFrame @@ -136,7 +137,7 @@ def scan_ndjson( n_rows, low_memory, rechunk, - _prepare_row_index_args(row_index_name, row_index_offset), + prepare_row_index_args(row_index_name, row_index_offset), ignore_errors, ) return wrap_ldf(pylf) diff --git a/py-polars/polars/io/parquet/anonymous_scan.py b/py-polars/polars/io/parquet/anonymous_scan.py index 5f1d72013bec..6bb06e2f2d32 100644 --- a/py-polars/polars/io/parquet/anonymous_scan.py +++ b/py-polars/polars/io/parquet/anonymous_scan.py @@ -5,7 +5,7 @@ import polars._reexport as pl import polars.io.parquet -from polars.io._utils import _prepare_file_arg +from polars.io._utils import prepare_file_arg if TYPE_CHECKING: from polars import DataFrame, LazyFrame @@ -17,7 +17,7 @@ def _scan_parquet_fsspec( ) -> LazyFrame: func = partial(_scan_parquet_impl, source, storage_options=storage_options) - with _prepare_file_arg(source, storage_options=storage_options) as data: + with prepare_file_arg(source, storage_options=storage_options) as data: schema = polars.io.parquet.read_parquet_schema(data) return pl.LazyFrame._scan_python_function(schema, func) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index d83b671159ab..82d386bf4c13 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -8,7 +8,6 @@ from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.unstable import issue_unstable_warning from polars._utils.various import ( - _prepare_row_index_args, is_int_sequence, is_str_sequence, normalize_filepath, @@ -17,11 +16,12 @@ from polars.convert import from_arrow from polars.dependencies import _PYARROW_AVAILABLE from polars.io._utils import ( - _is_glob_pattern, - _is_local_file, - _is_supported_cloud, - _prepare_file_arg, handle_projection_columns, + is_glob_pattern, + is_local_file, + is_supported_cloud, + prepare_file_arg, + prepare_row_index_args, ) from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec @@ -167,7 +167,7 @@ def read_parquet( pyarrow_options = pyarrow_options or {} - with _prepare_file_arg( + with prepare_file_arg( source, # type: ignore[arg-type] use_pyarrow=True, storage_options=storage_options, @@ -183,7 +183,7 @@ def read_parquet( # Read binary types using `read_parquet` elif isinstance(source, (io.BufferedIOBase, io.RawIOBase, bytes)): - with _prepare_file_arg(source, use_pyarrow=False) as source_prep: + with prepare_file_arg(source, use_pyarrow=False) as source_prep: return _read_parquet_binary( source_prep, columns=columns, @@ -238,7 +238,7 @@ def _read_parquet_binary( if isinstance(columns, str): columns = [columns] - if isinstance(source, str) and _is_glob_pattern(source): + if isinstance(source, str) and is_glob_pattern(source): from polars import scan_parquet scan = scan_parquet( @@ -270,7 +270,7 @@ def _read_parquet_binary( projection, n_rows, parallel, - _prepare_row_index_args(row_index_name, row_index_offset), + prepare_row_index_args(row_index_name, row_index_offset), low_memory=low_memory, use_statistics=use_statistics, rechunk=rechunk, @@ -451,8 +451,8 @@ def _scan_parquet_impl( # try fsspec scanner if ( can_use_fsspec - and not _is_local_file(source) # type: ignore[arg-type] - and not _is_supported_cloud(source) # type: ignore[arg-type] + and not is_local_file(source) # type: ignore[arg-type] + and not is_supported_cloud(source) # type: ignore[arg-type] ): scan = _scan_parquet_fsspec(source, storage_options) # type: ignore[arg-type] if n_rows: @@ -474,7 +474,7 @@ def _scan_parquet_impl( cache, parallel, rechunk, - _prepare_row_index_args(row_index_name, row_index_offset), + prepare_row_index_args(row_index_name, row_index_offset), low_memory, cloud_options=storage_options, use_statistics=use_statistics, diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 9007255a6432..7f79cfa272a3 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -22,7 +22,7 @@ ) from polars.dependencies import import_optional from polars.exceptions import NoDataError, ParameterCollisionError -from polars.io._utils import PortableTemporaryFile, _looks_like_url, _process_file_url +from polars.io._utils import PortableTemporaryFile, looks_like_url, process_file_url from polars.io.csv.functions import read_csv if TYPE_CHECKING: @@ -447,8 +447,8 @@ def _read_spreadsheet( ) -> pl.DataFrame | dict[str, pl.DataFrame]: if is_file := isinstance(source, (str, Path)): source = normalize_filepath(source) - if _looks_like_url(source): - source = _process_file_url(source) + if looks_like_url(source): + source = process_file_url(source) if engine is None: if is_file and str(source).lower().endswith(".ods"): diff --git a/py-polars/tests/unit/io/cloud/test_utils.py b/py-polars/tests/unit/io/cloud/test_utils.py index 968661638ef7..90fb23343fe8 100644 --- a/py-polars/tests/unit/io/cloud/test_utils.py +++ b/py-polars/tests/unit/io/cloud/test_utils.py @@ -2,7 +2,7 @@ import pytest -from polars.io._utils import _is_supported_cloud +from polars.io._utils import is_supported_cloud @pytest.mark.parametrize( @@ -22,4 +22,4 @@ ], ) def test_is_cloud_url(url: str, expected: bool) -> None: - assert _is_supported_cloud(url) is expected + assert is_supported_cloud(url) is expected diff --git a/py-polars/tests/unit/utils/test_utils.py b/py-polars/tests/unit/utils/test_utils.py index d81540efc10d..e15b6e918bc9 100644 --- a/py-polars/tests/unit/utils/test_utils.py +++ b/py-polars/tests/unit/utils/test_utils.py @@ -23,7 +23,7 @@ parse_percentiles, parse_version, ) -from polars.io._utils import _looks_like_url +from polars.io._utils import looks_like_url if TYPE_CHECKING: from zoneinfo import ZoneInfo @@ -309,4 +309,4 @@ def test_is_str_sequence_check( ], ) def test_looks_like_url(url: str, result: bool) -> None: - assert _looks_like_url(url) == result + assert looks_like_url(url) == result From 0f4924b6f76cd19c92ebedb2a0686a2d486fe92f Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 7 Apr 2024 21:25:46 +0200 Subject: [PATCH 2/9] Driveby --- py-polars/polars/io/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index da7efe21b562..55ffe5297f4d 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -244,7 +244,7 @@ def managed_file(file: Any) -> Iterator[Any]: def _check_empty( b: BytesIO, *, context: str, raise_if_empty: bool, read_position: int | None = None ) -> BytesIO: - if raise_if_empty and not b.getbuffer().nbytes: + if raise_if_empty and b.getbuffer().nbytes == 0: hint = ( f" (buffer position = {read_position}; try seek(0) before reading?)" if context in ("StringIO", "BytesIO") and read_position From af5f83cc9af596685025cf0e56aa0716ea04612f Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 7 Apr 2024 21:25:56 +0200 Subject: [PATCH 3/9] Refactor --- py-polars/polars/io/_utils.py | 39 +++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 55ffe5297f4d..3078a1d6b0ec 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -17,24 +17,31 @@ def handle_projection_columns( columns: Sequence[str] | Sequence[int] | str | None, ) -> tuple[list[int] | None, Sequence[str] | None]: """Disambiguates between columns specified as integers vs. strings.""" + if columns is None: + return None, None + projection: list[int] | None = None new_columns: Sequence[str] | None = None - if columns is not None: - if isinstance(columns, str): - new_columns = [columns] - elif is_int_sequence(columns): - projection = list(columns) - elif not is_str_sequence(columns): - msg = "`columns` arg should contain a list of all integers or all strings values" - raise TypeError(msg) - else: - new_columns = columns - if columns and len(set(columns)) != len(columns): - msg = f"`columns` arg should only have unique values, got {columns!r}" - raise ValueError(msg) - if projection and len(set(projection)) != len(projection): - msg = f"`columns` arg should only have unique values, got {projection!r}" - raise ValueError(msg) + + if isinstance(columns, str): + new_columns = [columns] + elif is_int_sequence(columns): + projection = list(columns) + elif is_str_sequence(columns): + new_columns = columns + else: + msg = ( + "`columns` arg should contain a list of all integers or all strings values" + ) + raise TypeError(msg) + + if columns and len(set(columns)) != len(columns): + msg = f"`columns` arg should only have unique values, got {columns!r}" + raise ValueError(msg) + if projection and len(set(projection)) != len(projection): + msg = f"`columns` arg should only have unique values, got {projection!r}" + raise ValueError(msg) + return projection, new_columns From 4544f5c41d389a30869853aabbf34c5d61801fac Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 7 Apr 2024 21:44:57 +0200 Subject: [PATCH 4/9] Remove unneeded import --- py-polars/polars/io/parquet/functions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 82d386bf4c13..2d2673f99842 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -239,8 +239,6 @@ def _read_parquet_binary( columns = [columns] if isinstance(source, str) and is_glob_pattern(source): - from polars import scan_parquet - scan = scan_parquet( source, n_rows=n_rows, From 2881afe21c08b61b546a76acd9c2c174012513f1 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 7 Apr 2024 21:45:48 +0200 Subject: [PATCH 5/9] Small refactor --- py-polars/polars/io/_utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 3078a1d6b0ec..d243b472edea 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -25,14 +25,12 @@ def handle_projection_columns( if isinstance(columns, str): new_columns = [columns] - elif is_int_sequence(columns): - projection = list(columns) elif is_str_sequence(columns): new_columns = columns + elif is_int_sequence(columns): + projection = list(columns) else: - msg = ( - "`columns` arg should contain a list of all integers or all strings values" - ) + msg = "`columns` arg should contain a list of all integers or all string values" raise TypeError(msg) if columns and len(set(columns)) != len(columns): From 75548f0ee7fdaa2a3abd425a5ae253347e938a41 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 7 Apr 2024 23:24:50 +0200 Subject: [PATCH 6/9] Refactor parse columns util, add tests --- py-polars/polars/io/_utils.py | 45 +++++++++++++++-------- py-polars/polars/io/avro.py | 6 +-- py-polars/polars/io/csv/batched_reader.py | 4 +- py-polars/polars/io/csv/functions.py | 8 ++-- py-polars/polars/io/ipc/functions.py | 6 +-- py-polars/polars/io/parquet/functions.py | 4 +- py-polars/tests/unit/io/test_utils.py | 35 ++++++++++++++++++ 7 files changed, 78 insertions(+), 30 deletions(-) create mode 100644 py-polars/tests/unit/io/test_utils.py diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index d243b472edea..e878bc630ec5 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -13,34 +13,47 @@ from polars.exceptions import NoDataError -def handle_projection_columns( - columns: Sequence[str] | Sequence[int] | str | None, -) -> tuple[list[int] | None, Sequence[str] | None]: - """Disambiguates between columns specified as integers vs. strings.""" +def parse_columns_arg( + columns: Sequence[str] | Sequence[int] | str | int | None, +) -> tuple[Sequence[int] | None, Sequence[str] | None]: + """ + Parse the `columns` argument of an I/O function. + + Disambiguates between column names and column indices input. + + Returns + ------- + tuple + A tuple containing the columns as a projection and a list of column names. + Only one will be specified, the other will be `None`. + """ if columns is None: return None, None - projection: list[int] | None = None - new_columns: Sequence[str] | None = None + projection: Sequence[int] | None = None + column_names: Sequence[str] | None = None if isinstance(columns, str): - new_columns = [columns] + column_names = [columns] + elif isinstance(columns, int): + projection = [columns] elif is_str_sequence(columns): - new_columns = columns + _ensure_columns_are_unique(columns) + column_names = columns elif is_int_sequence(columns): - projection = list(columns) + _ensure_columns_are_unique(columns) + projection = columns else: - msg = "`columns` arg should contain a list of all integers or all string values" + msg = "the `columns` argument should contain a list of all integers or all string values" raise TypeError(msg) - if columns and len(set(columns)) != len(columns): + return projection, column_names + + +def _ensure_columns_are_unique(columns: Sequence[str] | Sequence[int]) -> None: + if len(columns) != len(set(columns)): msg = f"`columns` arg should only have unique values, got {columns!r}" raise ValueError(msg) - if projection and len(set(projection)) != len(projection): - msg = f"`columns` arg should only have unique values, got {projection!r}" - raise ValueError(msg) - - return projection, new_columns def prepare_row_index_args( diff --git a/py-polars/polars/io/avro.py b/py-polars/polars/io/avro.py index 3d695ba6ad35..feac29a7fd9e 100644 --- a/py-polars/polars/io/avro.py +++ b/py-polars/polars/io/avro.py @@ -6,7 +6,7 @@ from polars._utils.various import normalize_filepath from polars._utils.wrap import wrap_df -from polars.io._utils import handle_projection_columns +from polars.io._utils import parse_columns_arg with contextlib.suppress(ImportError): # Module not available when building docs from polars.polars import PyDataFrame @@ -42,7 +42,7 @@ def read_avro( """ if isinstance(source, (str, Path)): source = normalize_filepath(source) - projection, parsed_columns = handle_projection_columns(columns) + projection, column_names = parse_columns_arg(columns) - pydf = PyDataFrame.read_avro(source, parsed_columns, projection, n_rows) + pydf = PyDataFrame.read_avro(source, column_names, projection, n_rows) return wrap_df(pydf) diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index fa93ff3d5a57..2a6cec6e3a9c 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -9,7 +9,7 @@ ) from polars._utils.wrap import wrap_df from polars.datatypes import N_INFER_DEFAULT, py_type_to_dtype -from polars.io._utils import handle_projection_columns, prepare_row_index_args +from polars.io._utils import parse_columns_arg, prepare_row_index_args from polars.io.csv._utils import _update_columns with contextlib.suppress(ImportError): # Module not available when building docs @@ -72,7 +72,7 @@ def __init__( raise TypeError(msg) processed_null_values = _process_null_values(null_values) - projection, columns = handle_projection_columns(columns) + projection, columns = parse_columns_arg(columns) self._reader = PyBatchedCsv.new( infer_schema_length=infer_schema_length, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 68ab428dd61e..cabb84d9ae46 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -16,8 +16,8 @@ from polars.datatypes import N_INFER_DEFAULT, String from polars.datatypes.convert import py_type_to_dtype from polars.io._utils import ( - handle_projection_columns, is_glob_pattern, + parse_columns_arg, prepare_file_arg, prepare_row_index_args, ) @@ -235,7 +235,7 @@ def read_csv( _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True) _check_arg_is_1byte("eol_char", eol_char, can_be_empty=False) - projection, columns = handle_projection_columns(columns) + projection, columns = parse_columns_arg(columns) storage_options = storage_options or {} if columns and not has_header: @@ -548,7 +548,7 @@ def _read_csv_impl( ) raise ValueError(msg) - projection, columns = handle_projection_columns(columns) + projection, columns = parse_columns_arg(columns) pydf = PyDataFrame.read_csv( source, @@ -758,7 +758,7 @@ def read_csv_batched( ... ... batches = reader.next_batches(100) """ - projection, columns = handle_projection_columns(columns) + projection, columns = parse_columns_arg(columns) if columns and not has_header: for column in columns: diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index fde7be409ee1..bbdc6f8a0014 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -13,9 +13,9 @@ from polars._utils.wrap import wrap_df, wrap_ldf from polars.dependencies import _PYARROW_AVAILABLE from polars.io._utils import ( - handle_projection_columns, is_glob_pattern, is_local_file, + parse_columns_arg, prepare_file_arg, prepare_row_index_args, ) @@ -160,7 +160,7 @@ def _read_ipc_impl( raise TypeError(msg) return df - projection, columns = handle_projection_columns(columns) + projection, columns = parse_columns_arg(columns) pydf = PyDataFrame.read_ipc( source, columns, @@ -267,7 +267,7 @@ def _read_ipc_stream_impl( if isinstance(columns, str): columns = [columns] - projection, columns = handle_projection_columns(columns) + projection, columns = parse_columns_arg(columns) pydf = PyDataFrame.read_ipc_stream( source, columns, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 2d2673f99842..bd34ac31629b 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -16,10 +16,10 @@ from polars.convert import from_arrow from polars.dependencies import _PYARROW_AVAILABLE from polars.io._utils import ( - handle_projection_columns, is_glob_pattern, is_local_file, is_supported_cloud, + parse_columns_arg, prepare_file_arg, prepare_row_index_args, ) @@ -260,7 +260,7 @@ def _read_parquet_binary( ) raise TypeError(msg) - projection, columns = handle_projection_columns(columns) + projection, columns = parse_columns_arg(columns) pydf = PyDataFrame.read_parquet( source, diff --git a/py-polars/tests/unit/io/test_utils.py b/py-polars/tests/unit/io/test_utils.py new file mode 100644 index 000000000000..72613b068e18 --- /dev/null +++ b/py-polars/tests/unit/io/test_utils.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from typing import Sequence + +import pytest + +from polars.io._utils import parse_columns_arg + + +@pytest.mark.parametrize( + ("columns", "expected"), + [ + (["a", "b"], (None, ["a", "b"])), + ((1, 2), ((1, 2), None)), + ("foo", (None, ["foo"])), + (3, ([3], None)), + (None, (None, None)), + ], +) +def test_parse_columns_arg( + columns: Sequence[str] | Sequence[int] | str | int | None, + expected: tuple[Sequence[int] | None, Sequence[str] | None], +) -> None: + assert parse_columns_arg(columns) == expected + + +def test_parse_columns_arg_mixed_types() -> None: + with pytest.raises(TypeError): + parse_columns_arg(["a", 1]) + + +@pytest.mark.parametrize("columns", [["a", "a"], [1, 1, 2]]) +def test_parse_columns_arg_duplicates(columns: Sequence[str] | Sequence[int]) -> None: + with pytest.raises(ValueError): + parse_columns_arg(columns) From a6e4ce1637112d04fa7d389dc380d4310418efb4 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 8 Apr 2024 00:07:04 +0200 Subject: [PATCH 7/9] Rename parse_row_index_args --- py-polars/polars/io/_utils.py | 13 +++++++++---- py-polars/polars/io/csv/batched_reader.py | 4 ++-- py-polars/polars/io/csv/functions.py | 6 +++--- py-polars/polars/io/ipc/functions.py | 8 ++++---- py-polars/polars/io/ndjson.py | 4 ++-- py-polars/polars/io/parquet/functions.py | 6 +++--- py-polars/tests/unit/io/test_utils.py | 7 ++++++- 7 files changed, 29 insertions(+), 19 deletions(-) diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index e878bc630ec5..cef1ca45559f 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -56,14 +56,19 @@ def _ensure_columns_are_unique(columns: Sequence[str] | Sequence[int]) -> None: raise ValueError(msg) -def prepare_row_index_args( +def parse_row_index_args( row_index_name: str | None = None, row_index_offset: int = 0, ) -> tuple[str, int] | None: - if row_index_name is not None: - return (row_index_name, row_index_offset) - else: + """ + Parse the `row_index_name` and `row_index_offset` arguments of an I/O function. + + The Rust functions take a single tuple rather than two separate arguments. + """ + if row_index_name is None: return None + else: + return (row_index_name, row_index_offset) def is_glob_pattern(file: str) -> bool: diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 2a6cec6e3a9c..de8ae06844fe 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -9,7 +9,7 @@ ) from polars._utils.wrap import wrap_df from polars.datatypes import N_INFER_DEFAULT, py_type_to_dtype -from polars.io._utils import parse_columns_arg, prepare_row_index_args +from polars.io._utils import parse_columns_arg, parse_row_index_args from polars.io.csv._utils import _update_columns with contextlib.suppress(ImportError): # Module not available when building docs @@ -97,7 +97,7 @@ def __init__( missing_utf8_is_empty_string=missing_utf8_is_empty_string, try_parse_dates=try_parse_dates, skip_rows_after_header=skip_rows_after_header, - row_index=prepare_row_index_args(row_index_name, row_index_offset), + row_index=parse_row_index_args(row_index_name, row_index_offset), sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index cabb84d9ae46..72a79844b0af 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -18,8 +18,8 @@ from polars.io._utils import ( is_glob_pattern, parse_columns_arg, + parse_row_index_args, prepare_file_arg, - prepare_row_index_args, ) from polars.io.csv._utils import _check_arg_is_1byte, _update_columns from polars.io.csv.batched_reader import BatchedCsvReader @@ -574,7 +574,7 @@ def _read_csv_impl( missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, - prepare_row_index_args(row_index_name, row_index_offset), + parse_row_index_args(row_index_name, row_index_offset), sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, @@ -1189,7 +1189,7 @@ def _scan_csv_impl( rechunk, skip_rows_after_header, encoding, - prepare_row_index_args(row_index_name, row_index_offset), + parse_row_index_args(row_index_name, row_index_offset), try_parse_dates, eol_char=eol_char, raise_if_empty=raise_if_empty, diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index bbdc6f8a0014..ddae71836953 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -16,8 +16,8 @@ is_glob_pattern, is_local_file, parse_columns_arg, + parse_row_index_args, prepare_file_arg, - prepare_row_index_args, ) from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec @@ -166,7 +166,7 @@ def _read_ipc_impl( columns, projection, n_rows, - prepare_row_index_args(row_index_name, row_index_offset), + parse_row_index_args(row_index_name, row_index_offset), memory_map=memory_map, ) return wrap_df(pydf) @@ -273,7 +273,7 @@ def _read_ipc_stream_impl( columns, projection, n_rows, - prepare_row_index_args(row_index_name, row_index_offset), + parse_row_index_args(row_index_name, row_index_offset), rechunk, ) return wrap_df(pydf) @@ -372,7 +372,7 @@ def scan_ipc( n_rows, cache, rechunk, - prepare_row_index_args(row_index_name, row_index_offset), + parse_row_index_args(row_index_name, row_index_offset), memory_map=memory_map, cloud_options=storage_options, retries=retries, diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index c1c077c66ea4..72d1263e8f9d 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -9,7 +9,7 @@ from polars._utils.various import normalize_filepath from polars._utils.wrap import wrap_df, wrap_ldf from polars.datatypes import N_INFER_DEFAULT -from polars.io._utils import prepare_row_index_args +from polars.io._utils import parse_row_index_args with contextlib.suppress(ImportError): # Module not available when building docs from polars.polars import PyDataFrame, PyLazyFrame @@ -137,7 +137,7 @@ def scan_ndjson( n_rows, low_memory, rechunk, - prepare_row_index_args(row_index_name, row_index_offset), + parse_row_index_args(row_index_name, row_index_offset), ignore_errors, ) return wrap_ldf(pylf) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index bd34ac31629b..224b72b69e23 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -20,8 +20,8 @@ is_local_file, is_supported_cloud, parse_columns_arg, + parse_row_index_args, prepare_file_arg, - prepare_row_index_args, ) from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec @@ -268,7 +268,7 @@ def _read_parquet_binary( projection, n_rows, parallel, - prepare_row_index_args(row_index_name, row_index_offset), + parse_row_index_args(row_index_name, row_index_offset), low_memory=low_memory, use_statistics=use_statistics, rechunk=rechunk, @@ -472,7 +472,7 @@ def _scan_parquet_impl( cache, parallel, rechunk, - prepare_row_index_args(row_index_name, row_index_offset), + parse_row_index_args(row_index_name, row_index_offset), low_memory, cloud_options=storage_options, use_statistics=use_statistics, diff --git a/py-polars/tests/unit/io/test_utils.py b/py-polars/tests/unit/io/test_utils.py index 72613b068e18..797435fd44e8 100644 --- a/py-polars/tests/unit/io/test_utils.py +++ b/py-polars/tests/unit/io/test_utils.py @@ -4,7 +4,7 @@ import pytest -from polars.io._utils import parse_columns_arg +from polars.io._utils import parse_columns_arg, parse_row_index_args @pytest.mark.parametrize( @@ -33,3 +33,8 @@ def test_parse_columns_arg_mixed_types() -> None: def test_parse_columns_arg_duplicates(columns: Sequence[str] | Sequence[int]) -> None: with pytest.raises(ValueError): parse_columns_arg(columns) + + +def test_parse_row_index_args() -> None: + assert parse_row_index_args("idx", 5) == ("idx", 5) + assert parse_row_index_args(None, 5) is None From 1dea764d301ffcb291c04e24735896a8a9575061 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 8 Apr 2024 00:36:47 +0200 Subject: [PATCH 8/9] Move some stuff around --- py-polars/polars/io/_utils.py | 70 ++++---------------- py-polars/polars/io/spreadsheet/_utils.py | 47 +++++++++++++ py-polars/polars/io/spreadsheet/functions.py | 3 +- py-polars/tests/unit/io/test_utils.py | 23 ++++++- py-polars/tests/unit/utils/test_utils.py | 20 ------ 5 files changed, 84 insertions(+), 79 deletions(-) create mode 100644 py-polars/polars/io/spreadsheet/_utils.py diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index cef1ca45559f..6493f63f5779 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -5,8 +5,7 @@ from contextlib import contextmanager from io import BytesIO, StringIO from pathlib import Path -from tempfile import NamedTemporaryFile -from typing import IO, Any, ContextManager, Iterator, Sequence, cast, overload +from typing import IO, Any, ContextManager, Iterator, Sequence, overload from polars._utils.various import is_int_sequence, is_str_sequence, normalize_filepath from polars.dependencies import _FSSPEC_AVAILABLE, fsspec @@ -71,23 +70,6 @@ def parse_row_index_args( return (row_index_name, row_index_offset) -def is_glob_pattern(file: str) -> bool: - return any(char in file for char in ["*", "?", "["]) - - -def is_supported_cloud(file: str) -> bool: - return bool(re.match("^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?)://", file)) - - -def is_local_file(file: str) -> bool: - try: - next(glob.iglob(file, recursive=True)) # noqa: PTH207 - except StopIteration: - return False - else: - return True - - @overload def prepare_file_arg( file: str | Path | list[str] | IO[bytes] | bytes, @@ -292,42 +274,18 @@ def process_file_url(path: str, encoding: str | None = None) -> BytesIO: return BytesIO(f.read().decode(encoding).encode("utf8")) -@contextmanager -def PortableTemporaryFile( - mode: str = "w+b", - *, - buffering: int = -1, - encoding: str | None = None, - newline: str | None = None, - suffix: str | None = None, - prefix: str | None = None, - dir: str | Path | None = None, - delete: bool = True, - errors: str | None = None, -) -> Iterator[Any]: - """ - Slightly more resilient version of the standard `NamedTemporaryFile`. +def is_glob_pattern(file: str) -> bool: + return any(char in file for char in ["*", "?", "["]) - Plays better with Windows when using the 'delete' option. - """ - params = cast( - Any, - { - "mode": mode, - "buffering": buffering, - "encoding": encoding, - "newline": newline, - "suffix": suffix, - "prefix": prefix, - "dir": dir, - "delete": False, - "errors": errors, - }, - ) - tmp = NamedTemporaryFile(**params) + +def is_supported_cloud(file: str) -> bool: + return bool(re.match("^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?)://", file)) + + +def is_local_file(file: str) -> bool: try: - yield tmp - finally: - tmp.close() - if delete: - Path(tmp.name).unlink(missing_ok=True) + next(glob.iglob(file, recursive=True)) # noqa: PTH207 + except StopIteration: + return False + else: + return True diff --git a/py-polars/polars/io/spreadsheet/_utils.py b/py-polars/polars/io/spreadsheet/_utils.py new file mode 100644 index 000000000000..cbf86eb45a48 --- /dev/null +++ b/py-polars/polars/io/spreadsheet/_utils.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from contextlib import contextmanager +from pathlib import Path +from tempfile import NamedTemporaryFile +from typing import Any, Iterator, cast + + +@contextmanager +def PortableTemporaryFile( + mode: str = "w+b", + *, + buffering: int = -1, + encoding: str | None = None, + newline: str | None = None, + suffix: str | None = None, + prefix: str | None = None, + dir: str | Path | None = None, + delete: bool = True, + errors: str | None = None, +) -> Iterator[Any]: + """ + Slightly more resilient version of the standard `NamedTemporaryFile`. + + Plays better with Windows when using the 'delete' option. + """ + params = cast( + Any, + { + "mode": mode, + "buffering": buffering, + "encoding": encoding, + "newline": newline, + "suffix": suffix, + "prefix": prefix, + "dir": dir, + "delete": False, + "errors": errors, + }, + ) + tmp = NamedTemporaryFile(**params) + try: + yield tmp + finally: + tmp.close() + if delete: + Path(tmp.name).unlink(missing_ok=True) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 7f79cfa272a3..eaa455b03721 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -22,8 +22,9 @@ ) from polars.dependencies import import_optional from polars.exceptions import NoDataError, ParameterCollisionError -from polars.io._utils import PortableTemporaryFile, looks_like_url, process_file_url +from polars.io._utils import looks_like_url, process_file_url from polars.io.csv.functions import read_csv +from polars.io.spreadsheet._utils import PortableTemporaryFile if TYPE_CHECKING: from typing import Literal diff --git a/py-polars/tests/unit/io/test_utils.py b/py-polars/tests/unit/io/test_utils.py index 797435fd44e8..7c2173469ebb 100644 --- a/py-polars/tests/unit/io/test_utils.py +++ b/py-polars/tests/unit/io/test_utils.py @@ -4,7 +4,7 @@ import pytest -from polars.io._utils import parse_columns_arg, parse_row_index_args +from polars.io._utils import looks_like_url, parse_columns_arg, parse_row_index_args @pytest.mark.parametrize( @@ -26,7 +26,7 @@ def test_parse_columns_arg( def test_parse_columns_arg_mixed_types() -> None: with pytest.raises(TypeError): - parse_columns_arg(["a", 1]) + parse_columns_arg(["a", 1]) # type: ignore[arg-type] @pytest.mark.parametrize("columns", [["a", "a"], [1, 1, 2]]) @@ -38,3 +38,22 @@ def test_parse_columns_arg_duplicates(columns: Sequence[str] | Sequence[int]) -> def test_parse_row_index_args() -> None: assert parse_row_index_args("idx", 5) == ("idx", 5) assert parse_row_index_args(None, 5) is None + + +@pytest.mark.parametrize( + ("url", "result"), + [ + ("HTTPS://pola.rs/data.csv", True), + ("http://pola.rs/data.csv", True), + ("ftps://pola.rs/data.csv", True), + ("FTP://pola.rs/data.csv", True), + ("htp://pola.rs/data.csv", False), + ("fttp://pola.rs/data.csv", False), + ("http_not_a_url", False), + ("ftp_not_a_url", False), + ("/mnt/data.csv", False), + ("file://mnt/data.csv", False), + ], +) +def test_looks_like_url(url: str, result: bool) -> None: + assert looks_like_url(url) == result diff --git a/py-polars/tests/unit/utils/test_utils.py b/py-polars/tests/unit/utils/test_utils.py index e15b6e918bc9..194070822be3 100644 --- a/py-polars/tests/unit/utils/test_utils.py +++ b/py-polars/tests/unit/utils/test_utils.py @@ -23,7 +23,6 @@ parse_percentiles, parse_version, ) -from polars.io._utils import looks_like_url if TYPE_CHECKING: from zoneinfo import ZoneInfo @@ -291,22 +290,3 @@ def test_is_str_sequence_check( assert is_str_sequence(sequence, include_series=include_series) == expected if expected: assert is_sequence(sequence, include_series=include_series) - - -@pytest.mark.parametrize( - ("url", "result"), - [ - ("HTTPS://pola.rs/data.csv", True), - ("http://pola.rs/data.csv", True), - ("ftps://pola.rs/data.csv", True), - ("FTP://pola.rs/data.csv", True), - ("htp://pola.rs/data.csv", False), - ("fttp://pola.rs/data.csv", False), - ("http_not_a_url", False), - ("ftp_not_a_url", False), - ("/mnt/data.csv", False), - ("file://mnt/data.csv", False), - ], -) -def test_looks_like_url(url: str, result: bool) -> None: - assert looks_like_url(url) == result From 42b5b1ff8245babda6b04732728aa3f39333747a Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 8 Apr 2024 00:42:52 +0200 Subject: [PATCH 9/9] Some docstring cleanup --- py-polars/polars/io/_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index 6493f63f5779..3c11e031b9cb 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -117,15 +117,15 @@ def prepare_file_arg( Utility for read_[csv, parquet]. (not to be used by scan_[csv, parquet]). Returned value is always usable as a context. - A :class:`StringIO`, :class:`BytesIO` file is returned as a :class:`BytesIO`. + A `StringIO`, `BytesIO` file is returned as a `BytesIO`. A local path is returned as a string. - An http URL is read into a buffer and returned as a :class:`BytesIO`. + An http URL is read into a buffer and returned as a `BytesIO`. When `encoding` is not `utf8` or `utf8-lossy`, the whole file is - first read in python and decoded using the specified encoding and - returned as a :class:`BytesIO` (for usage with `read_csv`). + first read in Python and decoded using the specified encoding and + returned as a `BytesIO` (for usage with `read_csv`). - A `bytes` file is returned as a :class:`BytesIO` if `use_pyarrow=True`. + A `bytes` file is returned as a `BytesIO` if `use_pyarrow=True`. When fsspec is installed, remote file(s) is (are) opened with `fsspec.open(file, **kwargs)` or `fsspec.open_files(file, **kwargs)`.