From 7d97781dff16a1213cf9ca62185008caba116971 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Sun, 7 Apr 2024 20:05:18 +0200
Subject: [PATCH 1/9] Fix util names

---
 py-polars/polars/_utils/various.py            | 10 ------
 py-polars/polars/io/_utils.py                 | 32 ++++++++++++-------
 py-polars/polars/io/csv/batched_reader.py     |  5 ++-
 py-polars/polars/io/csv/functions.py          | 16 +++++-----
 py-polars/polars/io/ipc/anonymous_scan.py     |  4 +--
 py-polars/polars/io/ipc/functions.py          | 22 ++++++-------
 py-polars/polars/io/ndjson.py                 |  5 +--
 py-polars/polars/io/parquet/anonymous_scan.py |  4 +--
 py-polars/polars/io/parquet/functions.py      | 24 +++++++-------
 py-polars/polars/io/spreadsheet/functions.py  |  6 ++--
 py-polars/tests/unit/io/cloud/test_utils.py   |  4 +--
 py-polars/tests/unit/utils/test_utils.py      |  4 +--
 12 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py
index 5856168ed95c..292ef6d73fc2 100644
--- a/py-polars/polars/_utils/various.py
+++ b/py-polars/polars/_utils/various.py
@@ -158,16 +158,6 @@ def range_to_slice(rng: range) -> slice:
     return slice(rng.start, rng.stop, rng.step)
 
 
-def _prepare_row_index_args(
-    row_index_name: str | None = None,
-    row_index_offset: int = 0,
-) -> tuple[str, int] | None:
-    if row_index_name is not None:
-        return (row_index_name, row_index_offset)
-    else:
-        return None
-
-
 def _in_notebook() -> bool:
     try:
         from IPython import get_ipython
diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index b5e249c6b807..da7efe21b562 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -38,15 +38,25 @@ def handle_projection_columns(
     return projection, new_columns
 
 
-def _is_glob_pattern(file: str) -> bool:
+def prepare_row_index_args(
+    row_index_name: str | None = None,
+    row_index_offset: int = 0,
+) -> tuple[str, int] | None:
+    if row_index_name is not None:
+        return (row_index_name, row_index_offset)
+    else:
+        return None
+
+
+def is_glob_pattern(file: str) -> bool:
     return any(char in file for char in ["*", "?", "["])
 
 
-def _is_supported_cloud(file: str) -> bool:
+def is_supported_cloud(file: str) -> bool:
     return bool(re.match("^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?)://", file))
 
 
-def _is_local_file(file: str) -> bool:
+def is_local_file(file: str) -> bool:
     try:
         next(glob.iglob(file, recursive=True))  # noqa: PTH207
     except StopIteration:
@@ -56,7 +66,7 @@ def _is_local_file(file: str) -> bool:
 
 
 @overload
-def _prepare_file_arg(
+def prepare_file_arg(
     file: str | Path | list[str] | IO[bytes] | bytes,
     encoding: str | None = ...,
     *,
@@ -67,7 +77,7 @@ def _prepare_file_arg(
 
 
 @overload
-def _prepare_file_arg(
+def prepare_file_arg(
     file: str | Path | IO[str] | IO[bytes] | bytes,
     encoding: str | None = ...,
     *,
@@ -78,7 +88,7 @@ def _prepare_file_arg(
 
 
 @overload
-def _prepare_file_arg(
+def prepare_file_arg(
     file: str | Path | list[str] | IO[str] | IO[bytes] | bytes,
     encoding: str | None = ...,
     *,
@@ -88,7 +98,7 @@ def _prepare_file_arg(
 ) -> ContextManager[str | list[str] | BytesIO | list[BytesIO]]: ...
 
 
-def _prepare_file_arg(
+def prepare_file_arg(
     file: str | Path | list[str] | IO[str] | IO[bytes] | bytes,
     encoding: str | None = None,
     *,
@@ -181,8 +191,8 @@ def managed_file(file: Any) -> Iterator[Any]:
         # make sure that this is before fsspec
         # as fsspec needs requests to be installed
         # to read from http
-        if _looks_like_url(file):
-            return _process_file_url(file, encoding_str)
+        if looks_like_url(file):
+            return process_file_url(file, encoding_str)
         if _FSSPEC_AVAILABLE:
             from fsspec.utils import infer_storage_options
 
@@ -245,11 +255,11 @@ def _check_empty(
     return b
 
 
-def _looks_like_url(path: str) -> bool:
+def looks_like_url(path: str) -> bool:
     return re.match("^(ht|f)tps?://", path, re.IGNORECASE) is not None
 
 
-def _process_file_url(path: str, encoding: str | None = None) -> BytesIO:
+def process_file_url(path: str, encoding: str | None = None) -> BytesIO:
     from urllib.request import urlopen
 
     with urlopen(path) as f:
diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py
index 6f79d0e71b63..fa93ff3d5a57 100644
--- a/py-polars/polars/io/csv/batched_reader.py
+++ b/py-polars/polars/io/csv/batched_reader.py
@@ -4,13 +4,12 @@
 from typing import TYPE_CHECKING, Sequence
 
 from polars._utils.various import (
-    _prepare_row_index_args,
     _process_null_values,
     normalize_filepath,
 )
 from polars._utils.wrap import wrap_df
 from polars.datatypes import N_INFER_DEFAULT, py_type_to_dtype
-from polars.io._utils import handle_projection_columns
+from polars.io._utils import handle_projection_columns, prepare_row_index_args
 from polars.io.csv._utils import _update_columns
 
 with contextlib.suppress(ImportError):  # Module not available when building docs
@@ -98,7 +97,7 @@ def __init__(
             missing_utf8_is_empty_string=missing_utf8_is_empty_string,
             try_parse_dates=try_parse_dates,
             skip_rows_after_header=skip_rows_after_header,
-            row_index=_prepare_row_index_args(row_index_name, row_index_offset),
+            row_index=prepare_row_index_args(row_index_name, row_index_offset),
             sample_size=sample_size,
             eol_char=eol_char,
             raise_if_empty=raise_if_empty,
diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py
index b3410ac088c1..68ab428dd61e 100644
--- a/py-polars/polars/io/csv/functions.py
+++ b/py-polars/polars/io/csv/functions.py
@@ -8,7 +8,6 @@
 import polars._reexport as pl
 from polars._utils.deprecation import deprecate_renamed_parameter
 from polars._utils.various import (
-    _prepare_row_index_args,
     _process_null_values,
     is_str_sequence,
     normalize_filepath,
@@ -17,9 +16,10 @@
 from polars.datatypes import N_INFER_DEFAULT, String
 from polars.datatypes.convert import py_type_to_dtype
 from polars.io._utils import (
-    _is_glob_pattern,
-    _prepare_file_arg,
     handle_projection_columns,
+    is_glob_pattern,
+    prepare_file_arg,
+    prepare_row_index_args,
 )
 from polars.io.csv._utils import _check_arg_is_1byte, _update_columns
 from polars.io.csv.batched_reader import BatchedCsvReader
@@ -269,7 +269,7 @@ def read_csv(
             # for pyarrow.
             include_columns = [f"f{column_idx}" for column_idx in projection]
 
-        with _prepare_file_arg(
+        with prepare_file_arg(
             source,
             encoding=None,
             use_pyarrow=True,
@@ -403,7 +403,7 @@ def read_csv(
                 for column_name, column_dtype in dtypes.items()
             }
 
-    with _prepare_file_arg(
+    with prepare_file_arg(
         source,
         encoding=encoding,
         use_pyarrow=False,
@@ -502,7 +502,7 @@ def _read_csv_impl(
 
     if isinstance(columns, str):
         columns = [columns]
-    if isinstance(source, str) and _is_glob_pattern(source):
+    if isinstance(source, str) and is_glob_pattern(source):
         dtypes_dict = None
         if dtype_list is not None:
             dtypes_dict = dict(dtype_list)
@@ -574,7 +574,7 @@ def _read_csv_impl(
         missing_utf8_is_empty_string,
         try_parse_dates,
         skip_rows_after_header,
-        _prepare_row_index_args(row_index_name, row_index_offset),
+        prepare_row_index_args(row_index_name, row_index_offset),
         sample_size=sample_size,
         eol_char=eol_char,
         raise_if_empty=raise_if_empty,
@@ -1189,7 +1189,7 @@ def _scan_csv_impl(
         rechunk,
         skip_rows_after_header,
         encoding,
-        _prepare_row_index_args(row_index_name, row_index_offset),
+        prepare_row_index_args(row_index_name, row_index_offset),
         try_parse_dates,
         eol_char=eol_char,
         raise_if_empty=raise_if_empty,
diff --git a/py-polars/polars/io/ipc/anonymous_scan.py b/py-polars/polars/io/ipc/anonymous_scan.py
index 3b2fb5cb8c2f..e0648c4980d1 100644
--- a/py-polars/polars/io/ipc/anonymous_scan.py
+++ b/py-polars/polars/io/ipc/anonymous_scan.py
@@ -5,7 +5,7 @@
 
 import polars._reexport as pl
 import polars.io.ipc
-from polars.io._utils import _prepare_file_arg
+from polars.io._utils import prepare_file_arg
 
 if TYPE_CHECKING:
     from polars import DataFrame, LazyFrame
@@ -18,7 +18,7 @@ def _scan_ipc_fsspec(
     func = partial(_scan_ipc_impl, source, storage_options=storage_options)
 
     storage_options = storage_options or {}
-    with _prepare_file_arg(source, storage_options=storage_options) as data:
+    with prepare_file_arg(source, storage_options=storage_options) as data:
         schema = polars.io.ipc.read_ipc_schema(data)
 
     return pl.LazyFrame._scan_python_function(schema, func)
diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py
index 52d6b6a307c8..fde7be409ee1 100644
--- a/py-polars/polars/io/ipc/functions.py
+++ b/py-polars/polars/io/ipc/functions.py
@@ -7,17 +7,17 @@
 import polars._reexport as pl
 from polars._utils.deprecation import deprecate_renamed_parameter
 from polars._utils.various import (
-    _prepare_row_index_args,
     is_str_sequence,
     normalize_filepath,
 )
 from polars._utils.wrap import wrap_df, wrap_ldf
 from polars.dependencies import _PYARROW_AVAILABLE
 from polars.io._utils import (
-    _is_glob_pattern,
-    _is_local_file,
-    _prepare_file_arg,
     handle_projection_columns,
+    is_glob_pattern,
+    is_local_file,
+    prepare_file_arg,
+    prepare_row_index_args,
 )
 from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec
 
@@ -94,7 +94,7 @@ def read_ipc(
         msg = "`n_rows` cannot be used with `use_pyarrow=True` and `memory_map=False`"
         raise ValueError(msg)
 
-    with _prepare_file_arg(
+    with prepare_file_arg(
         source, use_pyarrow=use_pyarrow, storage_options=storage_options
     ) as data:
         if use_pyarrow:
@@ -139,7 +139,7 @@ def _read_ipc_impl(
     if isinstance(columns, str):
         columns = [columns]
 
-    if isinstance(source, str) and _is_glob_pattern(source) and _is_local_file(source):
+    if isinstance(source, str) and is_glob_pattern(source) and is_local_file(source):
         scan = scan_ipc(
             source,
             n_rows=n_rows,
@@ -166,7 +166,7 @@ def _read_ipc_impl(
         columns,
         projection,
         n_rows,
-        _prepare_row_index_args(row_index_name, row_index_offset),
+        prepare_row_index_args(row_index_name, row_index_offset),
         memory_map=memory_map,
     )
     return wrap_df(pydf)
@@ -221,7 +221,7 @@ def read_ipc_stream(
     -------
     DataFrame
     """
-    with _prepare_file_arg(
+    with prepare_file_arg(
         source, use_pyarrow=use_pyarrow, storage_options=storage_options
     ) as data:
         if use_pyarrow:
@@ -273,7 +273,7 @@ def _read_ipc_stream_impl(
         columns,
         projection,
         n_rows,
-        _prepare_row_index_args(row_index_name, row_index_offset),
+        prepare_row_index_args(row_index_name, row_index_offset),
         rechunk,
     )
     return wrap_df(pydf)
@@ -358,7 +358,7 @@ def scan_ipc(
         source = None  # type: ignore[assignment]
 
     # try fsspec scanner
-    if can_use_fsspec and not _is_local_file(source):  # type: ignore[arg-type]
+    if can_use_fsspec and not is_local_file(source):  # type: ignore[arg-type]
         scan = _scan_ipc_fsspec(source, storage_options)  # type: ignore[arg-type]
         if n_rows:
             scan = scan.head(n_rows)
@@ -372,7 +372,7 @@ def scan_ipc(
         n_rows,
         cache,
         rechunk,
-        _prepare_row_index_args(row_index_name, row_index_offset),
+        prepare_row_index_args(row_index_name, row_index_offset),
         memory_map=memory_map,
         cloud_options=storage_options,
         retries=retries,
diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py
index 0e810a3f1eea..c1c077c66ea4 100644
--- a/py-polars/polars/io/ndjson.py
+++ b/py-polars/polars/io/ndjson.py
@@ -6,9 +6,10 @@
 from typing import TYPE_CHECKING
 
 from polars._utils.deprecation import deprecate_renamed_parameter
-from polars._utils.various import _prepare_row_index_args, normalize_filepath
+from polars._utils.various import normalize_filepath
 from polars._utils.wrap import wrap_df, wrap_ldf
 from polars.datatypes import N_INFER_DEFAULT
+from polars.io._utils import prepare_row_index_args
 
 with contextlib.suppress(ImportError):  # Module not available when building docs
     from polars.polars import PyDataFrame, PyLazyFrame
@@ -136,7 +137,7 @@ def scan_ndjson(
         n_rows,
         low_memory,
         rechunk,
-        _prepare_row_index_args(row_index_name, row_index_offset),
+        prepare_row_index_args(row_index_name, row_index_offset),
         ignore_errors,
     )
     return wrap_ldf(pylf)
diff --git a/py-polars/polars/io/parquet/anonymous_scan.py b/py-polars/polars/io/parquet/anonymous_scan.py
index 5f1d72013bec..6bb06e2f2d32 100644
--- a/py-polars/polars/io/parquet/anonymous_scan.py
+++ b/py-polars/polars/io/parquet/anonymous_scan.py
@@ -5,7 +5,7 @@
 
 import polars._reexport as pl
 import polars.io.parquet
-from polars.io._utils import _prepare_file_arg
+from polars.io._utils import prepare_file_arg
 
 if TYPE_CHECKING:
     from polars import DataFrame, LazyFrame
@@ -17,7 +17,7 @@ def _scan_parquet_fsspec(
 ) -> LazyFrame:
     func = partial(_scan_parquet_impl, source, storage_options=storage_options)
 
-    with _prepare_file_arg(source, storage_options=storage_options) as data:
+    with prepare_file_arg(source, storage_options=storage_options) as data:
         schema = polars.io.parquet.read_parquet_schema(data)
 
     return pl.LazyFrame._scan_python_function(schema, func)
diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py
index d83b671159ab..82d386bf4c13 100644
--- a/py-polars/polars/io/parquet/functions.py
+++ b/py-polars/polars/io/parquet/functions.py
@@ -8,7 +8,6 @@
 from polars._utils.deprecation import deprecate_renamed_parameter
 from polars._utils.unstable import issue_unstable_warning
 from polars._utils.various import (
-    _prepare_row_index_args,
     is_int_sequence,
     is_str_sequence,
     normalize_filepath,
@@ -17,11 +16,12 @@
 from polars.convert import from_arrow
 from polars.dependencies import _PYARROW_AVAILABLE
 from polars.io._utils import (
-    _is_glob_pattern,
-    _is_local_file,
-    _is_supported_cloud,
-    _prepare_file_arg,
     handle_projection_columns,
+    is_glob_pattern,
+    is_local_file,
+    is_supported_cloud,
+    prepare_file_arg,
+    prepare_row_index_args,
 )
 from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec
 
@@ -167,7 +167,7 @@ def read_parquet(
 
         pyarrow_options = pyarrow_options or {}
 
-        with _prepare_file_arg(
+        with prepare_file_arg(
             source,  # type: ignore[arg-type]
             use_pyarrow=True,
             storage_options=storage_options,
@@ -183,7 +183,7 @@ def read_parquet(
 
     # Read binary types using `read_parquet`
     elif isinstance(source, (io.BufferedIOBase, io.RawIOBase, bytes)):
-        with _prepare_file_arg(source, use_pyarrow=False) as source_prep:
+        with prepare_file_arg(source, use_pyarrow=False) as source_prep:
             return _read_parquet_binary(
                 source_prep,
                 columns=columns,
@@ -238,7 +238,7 @@ def _read_parquet_binary(
     if isinstance(columns, str):
         columns = [columns]
 
-    if isinstance(source, str) and _is_glob_pattern(source):
+    if isinstance(source, str) and is_glob_pattern(source):
         from polars import scan_parquet
 
         scan = scan_parquet(
@@ -270,7 +270,7 @@ def _read_parquet_binary(
         projection,
         n_rows,
         parallel,
-        _prepare_row_index_args(row_index_name, row_index_offset),
+        prepare_row_index_args(row_index_name, row_index_offset),
         low_memory=low_memory,
         use_statistics=use_statistics,
         rechunk=rechunk,
@@ -451,8 +451,8 @@ def _scan_parquet_impl(
     # try fsspec scanner
     if (
         can_use_fsspec
-        and not _is_local_file(source)  # type: ignore[arg-type]
-        and not _is_supported_cloud(source)  # type: ignore[arg-type]
+        and not is_local_file(source)  # type: ignore[arg-type]
+        and not is_supported_cloud(source)  # type: ignore[arg-type]
     ):
         scan = _scan_parquet_fsspec(source, storage_options)  # type: ignore[arg-type]
         if n_rows:
@@ -474,7 +474,7 @@ def _scan_parquet_impl(
         cache,
         parallel,
         rechunk,
-        _prepare_row_index_args(row_index_name, row_index_offset),
+        prepare_row_index_args(row_index_name, row_index_offset),
         low_memory,
         cloud_options=storage_options,
         use_statistics=use_statistics,
diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
index 9007255a6432..7f79cfa272a3 100644
--- a/py-polars/polars/io/spreadsheet/functions.py
+++ b/py-polars/polars/io/spreadsheet/functions.py
@@ -22,7 +22,7 @@
 )
 from polars.dependencies import import_optional
 from polars.exceptions import NoDataError, ParameterCollisionError
-from polars.io._utils import PortableTemporaryFile, _looks_like_url, _process_file_url
+from polars.io._utils import PortableTemporaryFile, looks_like_url, process_file_url
 from polars.io.csv.functions import read_csv
 
 if TYPE_CHECKING:
@@ -447,8 +447,8 @@ def _read_spreadsheet(
 ) -> pl.DataFrame | dict[str, pl.DataFrame]:
     if is_file := isinstance(source, (str, Path)):
         source = normalize_filepath(source)
-        if _looks_like_url(source):
-            source = _process_file_url(source)
+        if looks_like_url(source):
+            source = process_file_url(source)
 
     if engine is None:
         if is_file and str(source).lower().endswith(".ods"):
diff --git a/py-polars/tests/unit/io/cloud/test_utils.py b/py-polars/tests/unit/io/cloud/test_utils.py
index 968661638ef7..90fb23343fe8 100644
--- a/py-polars/tests/unit/io/cloud/test_utils.py
+++ b/py-polars/tests/unit/io/cloud/test_utils.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from polars.io._utils import _is_supported_cloud
+from polars.io._utils import is_supported_cloud
 
 
 @pytest.mark.parametrize(
@@ -22,4 +22,4 @@
     ],
 )
 def test_is_cloud_url(url: str, expected: bool) -> None:
-    assert _is_supported_cloud(url) is expected
+    assert is_supported_cloud(url) is expected
diff --git a/py-polars/tests/unit/utils/test_utils.py b/py-polars/tests/unit/utils/test_utils.py
index d81540efc10d..e15b6e918bc9 100644
--- a/py-polars/tests/unit/utils/test_utils.py
+++ b/py-polars/tests/unit/utils/test_utils.py
@@ -23,7 +23,7 @@
     parse_percentiles,
     parse_version,
 )
-from polars.io._utils import _looks_like_url
+from polars.io._utils import looks_like_url
 
 if TYPE_CHECKING:
     from zoneinfo import ZoneInfo
@@ -309,4 +309,4 @@ def test_is_str_sequence_check(
     ],
 )
 def test_looks_like_url(url: str, result: bool) -> None:
-    assert _looks_like_url(url) == result
+    assert looks_like_url(url) == result

From 0f4924b6f76cd19c92ebedb2a0686a2d486fe92f Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Sun, 7 Apr 2024 21:25:46 +0200
Subject: [PATCH 2/9] Driveby: make empty-buffer check in _check_empty explicit

---
 py-polars/polars/io/_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index da7efe21b562..55ffe5297f4d 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -244,7 +244,7 @@ def managed_file(file: Any) -> Iterator[Any]:
 def _check_empty(
     b: BytesIO, *, context: str, raise_if_empty: bool, read_position: int | None = None
 ) -> BytesIO:
-    if raise_if_empty and not b.getbuffer().nbytes:
+    if raise_if_empty and b.getbuffer().nbytes == 0:
         hint = (
             f" (buffer position = {read_position}; try seek(0) before reading?)"
             if context in ("StringIO", "BytesIO") and read_position

From af5f83cc9af596685025cf0e56aa0716ea04612f Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Sun, 7 Apr 2024 21:25:56 +0200
Subject: [PATCH 3/9] Refactor handle_projection_columns to return early on None

---
 py-polars/polars/io/_utils.py | 39 +++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index 55ffe5297f4d..3078a1d6b0ec 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -17,24 +17,31 @@ def handle_projection_columns(
     columns: Sequence[str] | Sequence[int] | str | None,
 ) -> tuple[list[int] | None, Sequence[str] | None]:
     """Disambiguates between columns specified as integers vs. strings."""
+    if columns is None:
+        return None, None
+
     projection: list[int] | None = None
     new_columns: Sequence[str] | None = None
-    if columns is not None:
-        if isinstance(columns, str):
-            new_columns = [columns]
-        elif is_int_sequence(columns):
-            projection = list(columns)
-        elif not is_str_sequence(columns):
-            msg = "`columns` arg should contain a list of all integers or all strings values"
-            raise TypeError(msg)
-        else:
-            new_columns = columns
-        if columns and len(set(columns)) != len(columns):
-            msg = f"`columns` arg should only have unique values, got {columns!r}"
-            raise ValueError(msg)
-        if projection and len(set(projection)) != len(projection):
-            msg = f"`columns` arg should only have unique values, got {projection!r}"
-            raise ValueError(msg)
+
+    if isinstance(columns, str):
+        new_columns = [columns]
+    elif is_int_sequence(columns):
+        projection = list(columns)
+    elif is_str_sequence(columns):
+        new_columns = columns
+    else:
+        msg = (
+            "`columns` arg should contain a list of all integers or all strings values"
+        )
+        raise TypeError(msg)
+
+    if columns and len(set(columns)) != len(columns):
+        msg = f"`columns` arg should only have unique values, got {columns!r}"
+        raise ValueError(msg)
+    if projection and len(set(projection)) != len(projection):
+        msg = f"`columns` arg should only have unique values, got {projection!r}"
+        raise ValueError(msg)
+
     return projection, new_columns
 
 

From 4544f5c41d389a30869853aabbf34c5d61801fac Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Sun, 7 Apr 2024 21:44:57 +0200
Subject: [PATCH 4/9] Remove unneeded import

---
 py-polars/polars/io/parquet/functions.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py
index 82d386bf4c13..2d2673f99842 100644
--- a/py-polars/polars/io/parquet/functions.py
+++ b/py-polars/polars/io/parquet/functions.py
@@ -239,8 +239,6 @@ def _read_parquet_binary(
         columns = [columns]
 
     if isinstance(source, str) and is_glob_pattern(source):
-        from polars import scan_parquet
-
         scan = scan_parquet(
             source,
             n_rows=n_rows,

From 2881afe21c08b61b546a76acd9c2c174012513f1 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Sun, 7 Apr 2024 21:45:48 +0200
Subject: [PATCH 5/9] Small refactor: check str sequences first, fix message typo

---
 py-polars/polars/io/_utils.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index 3078a1d6b0ec..d243b472edea 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -25,14 +25,12 @@ def handle_projection_columns(
 
     if isinstance(columns, str):
         new_columns = [columns]
-    elif is_int_sequence(columns):
-        projection = list(columns)
     elif is_str_sequence(columns):
         new_columns = columns
+    elif is_int_sequence(columns):
+        projection = list(columns)
     else:
-        msg = (
-            "`columns` arg should contain a list of all integers or all strings values"
-        )
+        msg = "`columns` arg should contain a list of all integers or all string values"
         raise TypeError(msg)
 
     if columns and len(set(columns)) != len(columns):

From 75548f0ee7fdaa2a3abd425a5ae253347e938a41 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Sun, 7 Apr 2024 23:24:50 +0200
Subject: [PATCH 6/9] Refactor parse columns util, add tests

---
 py-polars/polars/io/_utils.py             | 45 +++++++++++++++--------
 py-polars/polars/io/avro.py               |  6 +--
 py-polars/polars/io/csv/batched_reader.py |  4 +-
 py-polars/polars/io/csv/functions.py      |  8 ++--
 py-polars/polars/io/ipc/functions.py      |  6 +--
 py-polars/polars/io/parquet/functions.py  |  4 +-
 py-polars/tests/unit/io/test_utils.py     | 35 ++++++++++++++++++
 7 files changed, 78 insertions(+), 30 deletions(-)
 create mode 100644 py-polars/tests/unit/io/test_utils.py

diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index d243b472edea..e878bc630ec5 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -13,34 +13,47 @@
 from polars.exceptions import NoDataError
 
 
-def handle_projection_columns(
-    columns: Sequence[str] | Sequence[int] | str | None,
-) -> tuple[list[int] | None, Sequence[str] | None]:
-    """Disambiguates between columns specified as integers vs. strings."""
+def parse_columns_arg(
+    columns: Sequence[str] | Sequence[int] | str | int | None,
+) -> tuple[Sequence[int] | None, Sequence[str] | None]:
+    """
+    Parse the `columns` argument of an I/O function.
+
+    Disambiguates between column names and column indices input.
+
+    Returns
+    -------
+    tuple
+        A tuple containing the columns as a projection and a list of column names.
+        Only one will be specified, the other will be `None`.
+    """
     if columns is None:
         return None, None
 
-    projection: list[int] | None = None
-    new_columns: Sequence[str] | None = None
+    projection: Sequence[int] | None = None
+    column_names: Sequence[str] | None = None
 
     if isinstance(columns, str):
-        new_columns = [columns]
+        column_names = [columns]
+    elif isinstance(columns, int):
+        projection = [columns]
     elif is_str_sequence(columns):
-        new_columns = columns
+        _ensure_columns_are_unique(columns)
+        column_names = columns
     elif is_int_sequence(columns):
-        projection = list(columns)
+        _ensure_columns_are_unique(columns)
+        projection = columns
     else:
-        msg = "`columns` arg should contain a list of all integers or all string values"
+        msg = "the `columns` argument should contain a list of all integers or all string values"
         raise TypeError(msg)
 
-    if columns and len(set(columns)) != len(columns):
+    return projection, column_names
+
+
+def _ensure_columns_are_unique(columns: Sequence[str] | Sequence[int]) -> None:
+    if len(columns) != len(set(columns)):
         msg = f"`columns` arg should only have unique values, got {columns!r}"
         raise ValueError(msg)
-    if projection and len(set(projection)) != len(projection):
-        msg = f"`columns` arg should only have unique values, got {projection!r}"
-        raise ValueError(msg)
-
-    return projection, new_columns
 
 
 def prepare_row_index_args(
diff --git a/py-polars/polars/io/avro.py b/py-polars/polars/io/avro.py
index 3d695ba6ad35..feac29a7fd9e 100644
--- a/py-polars/polars/io/avro.py
+++ b/py-polars/polars/io/avro.py
@@ -6,7 +6,7 @@
 
 from polars._utils.various import normalize_filepath
 from polars._utils.wrap import wrap_df
-from polars.io._utils import handle_projection_columns
+from polars.io._utils import parse_columns_arg
 
 with contextlib.suppress(ImportError):  # Module not available when building docs
     from polars.polars import PyDataFrame
@@ -42,7 +42,7 @@ def read_avro(
     """
     if isinstance(source, (str, Path)):
         source = normalize_filepath(source)
-    projection, parsed_columns = handle_projection_columns(columns)
+    projection, column_names = parse_columns_arg(columns)
 
-    pydf = PyDataFrame.read_avro(source, parsed_columns, projection, n_rows)
+    pydf = PyDataFrame.read_avro(source, column_names, projection, n_rows)
     return wrap_df(pydf)
diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py
index fa93ff3d5a57..2a6cec6e3a9c 100644
--- a/py-polars/polars/io/csv/batched_reader.py
+++ b/py-polars/polars/io/csv/batched_reader.py
@@ -9,7 +9,7 @@
 )
 from polars._utils.wrap import wrap_df
 from polars.datatypes import N_INFER_DEFAULT, py_type_to_dtype
-from polars.io._utils import handle_projection_columns, prepare_row_index_args
+from polars.io._utils import parse_columns_arg, prepare_row_index_args
 from polars.io.csv._utils import _update_columns
 
 with contextlib.suppress(ImportError):  # Module not available when building docs
@@ -72,7 +72,7 @@ def __init__(
                 raise TypeError(msg)
 
         processed_null_values = _process_null_values(null_values)
-        projection, columns = handle_projection_columns(columns)
+        projection, columns = parse_columns_arg(columns)
 
         self._reader = PyBatchedCsv.new(
             infer_schema_length=infer_schema_length,
diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py
index 68ab428dd61e..cabb84d9ae46 100644
--- a/py-polars/polars/io/csv/functions.py
+++ b/py-polars/polars/io/csv/functions.py
@@ -16,8 +16,8 @@
 from polars.datatypes import N_INFER_DEFAULT, String
 from polars.datatypes.convert import py_type_to_dtype
 from polars.io._utils import (
-    handle_projection_columns,
     is_glob_pattern,
+    parse_columns_arg,
     prepare_file_arg,
     prepare_row_index_args,
 )
@@ -235,7 +235,7 @@ def read_csv(
     _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
     _check_arg_is_1byte("eol_char", eol_char, can_be_empty=False)
 
-    projection, columns = handle_projection_columns(columns)
+    projection, columns = parse_columns_arg(columns)
     storage_options = storage_options or {}
 
     if columns and not has_header:
@@ -548,7 +548,7 @@ def _read_csv_impl(
             )
             raise ValueError(msg)
 
-    projection, columns = handle_projection_columns(columns)
+    projection, columns = parse_columns_arg(columns)
 
     pydf = PyDataFrame.read_csv(
         source,
@@ -758,7 +758,7 @@ def read_csv_batched(
     ...
     ...     batches = reader.next_batches(100)
     """
-    projection, columns = handle_projection_columns(columns)
+    projection, columns = parse_columns_arg(columns)
 
     if columns and not has_header:
         for column in columns:
diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py
index fde7be409ee1..bbdc6f8a0014 100644
--- a/py-polars/polars/io/ipc/functions.py
+++ b/py-polars/polars/io/ipc/functions.py
@@ -13,9 +13,9 @@
 from polars._utils.wrap import wrap_df, wrap_ldf
 from polars.dependencies import _PYARROW_AVAILABLE
 from polars.io._utils import (
-    handle_projection_columns,
     is_glob_pattern,
     is_local_file,
+    parse_columns_arg,
     prepare_file_arg,
     prepare_row_index_args,
 )
@@ -160,7 +160,7 @@ def _read_ipc_impl(
             raise TypeError(msg)
         return df
 
-    projection, columns = handle_projection_columns(columns)
+    projection, columns = parse_columns_arg(columns)
     pydf = PyDataFrame.read_ipc(
         source,
         columns,
@@ -267,7 +267,7 @@ def _read_ipc_stream_impl(
     if isinstance(columns, str):
         columns = [columns]
 
-    projection, columns = handle_projection_columns(columns)
+    projection, columns = parse_columns_arg(columns)
     pydf = PyDataFrame.read_ipc_stream(
         source,
         columns,
diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py
index 2d2673f99842..bd34ac31629b 100644
--- a/py-polars/polars/io/parquet/functions.py
+++ b/py-polars/polars/io/parquet/functions.py
@@ -16,10 +16,10 @@
 from polars.convert import from_arrow
 from polars.dependencies import _PYARROW_AVAILABLE
 from polars.io._utils import (
-    handle_projection_columns,
     is_glob_pattern,
     is_local_file,
     is_supported_cloud,
+    parse_columns_arg,
     prepare_file_arg,
     prepare_row_index_args,
 )
@@ -260,7 +260,7 @@ def _read_parquet_binary(
             )
             raise TypeError(msg)
 
-    projection, columns = handle_projection_columns(columns)
+    projection, columns = parse_columns_arg(columns)
 
     pydf = PyDataFrame.read_parquet(
         source,
diff --git a/py-polars/tests/unit/io/test_utils.py b/py-polars/tests/unit/io/test_utils.py
new file mode 100644
index 000000000000..72613b068e18
--- /dev/null
+++ b/py-polars/tests/unit/io/test_utils.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+from typing import Sequence
+
+import pytest
+
+from polars.io._utils import parse_columns_arg
+
+
+@pytest.mark.parametrize(
+    ("columns", "expected"),
+    [
+        (["a", "b"], (None, ["a", "b"])),
+        ((1, 2), ((1, 2), None)),
+        ("foo", (None, ["foo"])),
+        (3, ([3], None)),
+        (None, (None, None)),
+    ],
+)
+def test_parse_columns_arg(
+    columns: Sequence[str] | Sequence[int] | str | int | None,
+    expected: tuple[Sequence[int] | None, Sequence[str] | None],
+) -> None:
+    assert parse_columns_arg(columns) == expected
+
+
+def test_parse_columns_arg_mixed_types() -> None:
+    with pytest.raises(TypeError):
+        parse_columns_arg(["a", 1])
+
+
+@pytest.mark.parametrize("columns", [["a", "a"], [1, 1, 2]])
+def test_parse_columns_arg_duplicates(columns: Sequence[str] | Sequence[int]) -> None:
+    with pytest.raises(ValueError):
+        parse_columns_arg(columns)

From a6e4ce1637112d04fa7d389dc380d4310418efb4 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Mon, 8 Apr 2024 00:07:04 +0200
Subject: [PATCH 7/9] Rename prepare_row_index_args to parse_row_index_args

---
 py-polars/polars/io/_utils.py             | 13 +++++++++----
 py-polars/polars/io/csv/batched_reader.py |  4 ++--
 py-polars/polars/io/csv/functions.py      |  6 +++---
 py-polars/polars/io/ipc/functions.py      |  8 ++++----
 py-polars/polars/io/ndjson.py             |  4 ++--
 py-polars/polars/io/parquet/functions.py  |  6 +++---
 py-polars/tests/unit/io/test_utils.py     |  7 ++++++-
 7 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index e878bc630ec5..cef1ca45559f 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -56,14 +56,19 @@ def _ensure_columns_are_unique(columns: Sequence[str] | Sequence[int]) -> None:
         raise ValueError(msg)
 
 
-def prepare_row_index_args(
+def parse_row_index_args(
     row_index_name: str | None = None,
     row_index_offset: int = 0,
 ) -> tuple[str, int] | None:
-    if row_index_name is not None:
-        return (row_index_name, row_index_offset)
-    else:
+    """
+    Parse the `row_index_name` and `row_index_offset` arguments of an I/O function.
+
+    The Rust functions take a single tuple rather than two separate arguments.
+    """
+    if row_index_name is None:
         return None
+    else:
+        return (row_index_name, row_index_offset)
 
 
 def is_glob_pattern(file: str) -> bool:
diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py
index 2a6cec6e3a9c..de8ae06844fe 100644
--- a/py-polars/polars/io/csv/batched_reader.py
+++ b/py-polars/polars/io/csv/batched_reader.py
@@ -9,7 +9,7 @@
 )
 from polars._utils.wrap import wrap_df
 from polars.datatypes import N_INFER_DEFAULT, py_type_to_dtype
-from polars.io._utils import parse_columns_arg, prepare_row_index_args
+from polars.io._utils import parse_columns_arg, parse_row_index_args
 from polars.io.csv._utils import _update_columns
 
 with contextlib.suppress(ImportError):  # Module not available when building docs
@@ -97,7 +97,7 @@ def __init__(
             missing_utf8_is_empty_string=missing_utf8_is_empty_string,
             try_parse_dates=try_parse_dates,
             skip_rows_after_header=skip_rows_after_header,
-            row_index=prepare_row_index_args(row_index_name, row_index_offset),
+            row_index=parse_row_index_args(row_index_name, row_index_offset),
             sample_size=sample_size,
             eol_char=eol_char,
             raise_if_empty=raise_if_empty,
diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py
index cabb84d9ae46..72a79844b0af 100644
--- a/py-polars/polars/io/csv/functions.py
+++ b/py-polars/polars/io/csv/functions.py
@@ -18,8 +18,8 @@
 from polars.io._utils import (
     is_glob_pattern,
     parse_columns_arg,
+    parse_row_index_args,
     prepare_file_arg,
-    prepare_row_index_args,
 )
 from polars.io.csv._utils import _check_arg_is_1byte, _update_columns
 from polars.io.csv.batched_reader import BatchedCsvReader
@@ -574,7 +574,7 @@ def _read_csv_impl(
         missing_utf8_is_empty_string,
         try_parse_dates,
         skip_rows_after_header,
-        prepare_row_index_args(row_index_name, row_index_offset),
+        parse_row_index_args(row_index_name, row_index_offset),
         sample_size=sample_size,
         eol_char=eol_char,
         raise_if_empty=raise_if_empty,
@@ -1189,7 +1189,7 @@ def _scan_csv_impl(
         rechunk,
         skip_rows_after_header,
         encoding,
-        prepare_row_index_args(row_index_name, row_index_offset),
+        parse_row_index_args(row_index_name, row_index_offset),
         try_parse_dates,
         eol_char=eol_char,
         raise_if_empty=raise_if_empty,
diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py
index bbdc6f8a0014..ddae71836953 100644
--- a/py-polars/polars/io/ipc/functions.py
+++ b/py-polars/polars/io/ipc/functions.py
@@ -16,8 +16,8 @@
     is_glob_pattern,
     is_local_file,
     parse_columns_arg,
+    parse_row_index_args,
     prepare_file_arg,
-    prepare_row_index_args,
 )
 from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec
 
@@ -166,7 +166,7 @@ def _read_ipc_impl(
         columns,
         projection,
         n_rows,
-        prepare_row_index_args(row_index_name, row_index_offset),
+        parse_row_index_args(row_index_name, row_index_offset),
         memory_map=memory_map,
     )
     return wrap_df(pydf)
@@ -273,7 +273,7 @@ def _read_ipc_stream_impl(
         columns,
         projection,
         n_rows,
-        prepare_row_index_args(row_index_name, row_index_offset),
+        parse_row_index_args(row_index_name, row_index_offset),
         rechunk,
     )
     return wrap_df(pydf)
@@ -372,7 +372,7 @@ def scan_ipc(
         n_rows,
         cache,
         rechunk,
-        prepare_row_index_args(row_index_name, row_index_offset),
+        parse_row_index_args(row_index_name, row_index_offset),
         memory_map=memory_map,
         cloud_options=storage_options,
         retries=retries,
diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py
index c1c077c66ea4..72d1263e8f9d 100644
--- a/py-polars/polars/io/ndjson.py
+++ b/py-polars/polars/io/ndjson.py
@@ -9,7 +9,7 @@
 from polars._utils.various import normalize_filepath
 from polars._utils.wrap import wrap_df, wrap_ldf
 from polars.datatypes import N_INFER_DEFAULT
-from polars.io._utils import prepare_row_index_args
+from polars.io._utils import parse_row_index_args
 
 with contextlib.suppress(ImportError):  # Module not available when building docs
     from polars.polars import PyDataFrame, PyLazyFrame
@@ -137,7 +137,7 @@ def scan_ndjson(
         n_rows,
         low_memory,
         rechunk,
-        prepare_row_index_args(row_index_name, row_index_offset),
+        parse_row_index_args(row_index_name, row_index_offset),
         ignore_errors,
     )
     return wrap_ldf(pylf)
diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py
index bd34ac31629b..224b72b69e23 100644
--- a/py-polars/polars/io/parquet/functions.py
+++ b/py-polars/polars/io/parquet/functions.py
@@ -20,8 +20,8 @@
     is_local_file,
     is_supported_cloud,
     parse_columns_arg,
+    parse_row_index_args,
     prepare_file_arg,
-    prepare_row_index_args,
 )
 from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec
 
@@ -268,7 +268,7 @@ def _read_parquet_binary(
         projection,
         n_rows,
         parallel,
-        prepare_row_index_args(row_index_name, row_index_offset),
+        parse_row_index_args(row_index_name, row_index_offset),
         low_memory=low_memory,
         use_statistics=use_statistics,
         rechunk=rechunk,
@@ -472,7 +472,7 @@ def _scan_parquet_impl(
         cache,
         parallel,
         rechunk,
-        prepare_row_index_args(row_index_name, row_index_offset),
+        parse_row_index_args(row_index_name, row_index_offset),
         low_memory,
         cloud_options=storage_options,
         use_statistics=use_statistics,
diff --git a/py-polars/tests/unit/io/test_utils.py b/py-polars/tests/unit/io/test_utils.py
index 72613b068e18..797435fd44e8 100644
--- a/py-polars/tests/unit/io/test_utils.py
+++ b/py-polars/tests/unit/io/test_utils.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from polars.io._utils import parse_columns_arg
+from polars.io._utils import parse_columns_arg, parse_row_index_args
 
 
 @pytest.mark.parametrize(
@@ -33,3 +33,8 @@ def test_parse_columns_arg_mixed_types() -> None:
 def test_parse_columns_arg_duplicates(columns: Sequence[str] | Sequence[int]) -> None:
     with pytest.raises(ValueError):
         parse_columns_arg(columns)
+
+
+def test_parse_row_index_args() -> None:
+    assert parse_row_index_args("idx", 5) == ("idx", 5)
+    assert parse_row_index_args(None, 5) is None

From 1dea764d301ffcb291c04e24735896a8a9575061 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Mon, 8 Apr 2024 00:36:47 +0200
Subject: [PATCH 8/9] Move PortableTemporaryFile to spreadsheet utils, regroup I/O helpers

---
 py-polars/polars/io/_utils.py                | 70 ++++----------------
 py-polars/polars/io/spreadsheet/_utils.py    | 47 +++++++++++++
 py-polars/polars/io/spreadsheet/functions.py |  3 +-
 py-polars/tests/unit/io/test_utils.py        | 23 ++++++-
 py-polars/tests/unit/utils/test_utils.py     | 20 ------
 5 files changed, 84 insertions(+), 79 deletions(-)
 create mode 100644 py-polars/polars/io/spreadsheet/_utils.py

diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index cef1ca45559f..6493f63f5779 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -5,8 +5,7 @@
 from contextlib import contextmanager
 from io import BytesIO, StringIO
 from pathlib import Path
-from tempfile import NamedTemporaryFile
-from typing import IO, Any, ContextManager, Iterator, Sequence, cast, overload
+from typing import IO, Any, ContextManager, Iterator, Sequence, overload
 
 from polars._utils.various import is_int_sequence, is_str_sequence, normalize_filepath
 from polars.dependencies import _FSSPEC_AVAILABLE, fsspec
@@ -71,23 +70,6 @@ def parse_row_index_args(
         return (row_index_name, row_index_offset)
 
 
-def is_glob_pattern(file: str) -> bool:
-    return any(char in file for char in ["*", "?", "["])
-
-
-def is_supported_cloud(file: str) -> bool:
-    return bool(re.match("^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?)://", file))
-
-
-def is_local_file(file: str) -> bool:
-    try:
-        next(glob.iglob(file, recursive=True))  # noqa: PTH207
-    except StopIteration:
-        return False
-    else:
-        return True
-
-
 @overload
 def prepare_file_arg(
     file: str | Path | list[str] | IO[bytes] | bytes,
@@ -292,42 +274,18 @@ def process_file_url(path: str, encoding: str | None = None) -> BytesIO:
             return BytesIO(f.read().decode(encoding).encode("utf8"))
 
 
-@contextmanager
-def PortableTemporaryFile(
-    mode: str = "w+b",
-    *,
-    buffering: int = -1,
-    encoding: str | None = None,
-    newline: str | None = None,
-    suffix: str | None = None,
-    prefix: str | None = None,
-    dir: str | Path | None = None,
-    delete: bool = True,
-    errors: str | None = None,
-) -> Iterator[Any]:
-    """
-    Slightly more resilient version of the standard `NamedTemporaryFile`.
+def is_glob_pattern(file: str) -> bool:
+    return any(char in file for char in ["*", "?", "["])
 
-    Plays better with Windows when using the 'delete' option.
-    """
-    params = cast(
-        Any,
-        {
-            "mode": mode,
-            "buffering": buffering,
-            "encoding": encoding,
-            "newline": newline,
-            "suffix": suffix,
-            "prefix": prefix,
-            "dir": dir,
-            "delete": False,
-            "errors": errors,
-        },
-    )
-    tmp = NamedTemporaryFile(**params)
+
+def is_supported_cloud(file: str) -> bool:
+    return bool(re.match("^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?)://", file))
+
+
+def is_local_file(file: str) -> bool:
     try:
-        yield tmp
-    finally:
-        tmp.close()
-        if delete:
-            Path(tmp.name).unlink(missing_ok=True)
+        next(glob.iglob(file, recursive=True))  # noqa: PTH207
+    except StopIteration:
+        return False
+    else:
+        return True
diff --git a/py-polars/polars/io/spreadsheet/_utils.py b/py-polars/polars/io/spreadsheet/_utils.py
new file mode 100644
index 000000000000..cbf86eb45a48
--- /dev/null
+++ b/py-polars/polars/io/spreadsheet/_utils.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Any, Iterator, cast
+
+
+@contextmanager
+def PortableTemporaryFile(
+    mode: str = "w+b",
+    *,
+    buffering: int = -1,
+    encoding: str | None = None,
+    newline: str | None = None,
+    suffix: str | None = None,
+    prefix: str | None = None,
+    dir: str | Path | None = None,
+    delete: bool = True,
+    errors: str | None = None,
+) -> Iterator[Any]:
+    """
+    Slightly more resilient version of the standard `NamedTemporaryFile`.
+
+    Plays better with Windows when using the 'delete' option.
+    """
+    params = cast(
+        Any,
+        {
+            "mode": mode,
+            "buffering": buffering,
+            "encoding": encoding,
+            "newline": newline,
+            "suffix": suffix,
+            "prefix": prefix,
+            "dir": dir,
+            "delete": False,
+            "errors": errors,
+        },
+    )
+    tmp = NamedTemporaryFile(**params)
+    try:
+        yield tmp
+    finally:
+        tmp.close()
+        if delete:
+            Path(tmp.name).unlink(missing_ok=True)
diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
index 7f79cfa272a3..eaa455b03721 100644
--- a/py-polars/polars/io/spreadsheet/functions.py
+++ b/py-polars/polars/io/spreadsheet/functions.py
@@ -22,8 +22,9 @@
 )
 from polars.dependencies import import_optional
 from polars.exceptions import NoDataError, ParameterCollisionError
-from polars.io._utils import PortableTemporaryFile, looks_like_url, process_file_url
+from polars.io._utils import looks_like_url, process_file_url
 from polars.io.csv.functions import read_csv
+from polars.io.spreadsheet._utils import PortableTemporaryFile
 
 if TYPE_CHECKING:
     from typing import Literal
diff --git a/py-polars/tests/unit/io/test_utils.py b/py-polars/tests/unit/io/test_utils.py
index 797435fd44e8..7c2173469ebb 100644
--- a/py-polars/tests/unit/io/test_utils.py
+++ b/py-polars/tests/unit/io/test_utils.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from polars.io._utils import parse_columns_arg, parse_row_index_args
+from polars.io._utils import looks_like_url, parse_columns_arg, parse_row_index_args
 
 
 @pytest.mark.parametrize(
@@ -26,7 +26,7 @@ def test_parse_columns_arg(
 
 def test_parse_columns_arg_mixed_types() -> None:
     with pytest.raises(TypeError):
-        parse_columns_arg(["a", 1])
+        parse_columns_arg(["a", 1])  # type: ignore[arg-type]
 
 
 @pytest.mark.parametrize("columns", [["a", "a"], [1, 1, 2]])
@@ -38,3 +38,22 @@ def test_parse_columns_arg_duplicates(columns: Sequence[str] | Sequence[int]) ->
 def test_parse_row_index_args() -> None:
     assert parse_row_index_args("idx", 5) == ("idx", 5)
     assert parse_row_index_args(None, 5) is None
+
+
+@pytest.mark.parametrize(
+    ("url", "result"),
+    [
+        ("HTTPS://pola.rs/data.csv", True),
+        ("http://pola.rs/data.csv", True),
+        ("ftps://pola.rs/data.csv", True),
+        ("FTP://pola.rs/data.csv", True),
+        ("htp://pola.rs/data.csv", False),
+        ("fttp://pola.rs/data.csv", False),
+        ("http_not_a_url", False),
+        ("ftp_not_a_url", False),
+        ("/mnt/data.csv", False),
+        ("file://mnt/data.csv", False),
+    ],
+)
+def test_looks_like_url(url: str, result: bool) -> None:
+    assert looks_like_url(url) == result
diff --git a/py-polars/tests/unit/utils/test_utils.py b/py-polars/tests/unit/utils/test_utils.py
index e15b6e918bc9..194070822be3 100644
--- a/py-polars/tests/unit/utils/test_utils.py
+++ b/py-polars/tests/unit/utils/test_utils.py
@@ -23,7 +23,6 @@
     parse_percentiles,
     parse_version,
 )
-from polars.io._utils import looks_like_url
 
 if TYPE_CHECKING:
     from zoneinfo import ZoneInfo
@@ -291,22 +290,3 @@ def test_is_str_sequence_check(
     assert is_str_sequence(sequence, include_series=include_series) == expected
     if expected:
         assert is_sequence(sequence, include_series=include_series)
-
-
-@pytest.mark.parametrize(
-    ("url", "result"),
-    [
-        ("HTTPS://pola.rs/data.csv", True),
-        ("http://pola.rs/data.csv", True),
-        ("ftps://pola.rs/data.csv", True),
-        ("FTP://pola.rs/data.csv", True),
-        ("htp://pola.rs/data.csv", False),
-        ("fttp://pola.rs/data.csv", False),
-        ("http_not_a_url", False),
-        ("ftp_not_a_url", False),
-        ("/mnt/data.csv", False),
-        ("file://mnt/data.csv", False),
-    ],
-)
-def test_looks_like_url(url: str, result: bool) -> None:
-    assert looks_like_url(url) == result

From 42b5b1ff8245babda6b04732728aa3f39333747a Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Mon, 8 Apr 2024 00:42:52 +0200
Subject: [PATCH 9/9] Clean up prepare_file_arg docstring

---
 py-polars/polars/io/_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index 6493f63f5779..3c11e031b9cb 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -117,15 +117,15 @@ def prepare_file_arg(
     Utility for read_[csv, parquet]. (not to be used by scan_[csv, parquet]).
     Returned value is always usable as a context.
 
-    A :class:`StringIO`, :class:`BytesIO` file is returned as a :class:`BytesIO`.
+    A `StringIO` or `BytesIO` file is returned as a `BytesIO`.
     A local path is returned as a string.
-    An http URL is read into a buffer and returned as a :class:`BytesIO`.
+    An HTTP URL is read into a buffer and returned as a `BytesIO`.
 
     When `encoding` is not `utf8` or `utf8-lossy`, the whole file is
-    first read in python and decoded using the specified encoding and
-    returned as a :class:`BytesIO` (for usage with `read_csv`).
+    first read in Python and decoded using the specified encoding and
+    returned as a `BytesIO` (for usage with `read_csv`).
 
-    A `bytes` file is returned as a :class:`BytesIO` if `use_pyarrow=True`.
+    A `bytes` file is returned as a `BytesIO` if `use_pyarrow=True`.
 
     When fsspec is installed, remote file(s) is (are) opened with
     `fsspec.open(file, **kwargs)` or `fsspec.open_files(file, **kwargs)`.