Skip to content

Commit 7f5d9c7

Browse files
authored
feat(python): Add infer_schema parameter to read_csv / scan_csv (#17617)
1 parent c6d6d73 commit 7f5d9c7

File tree

2 files changed

+37
-6
lines changed

2 files changed

+37
-6
lines changed

py-polars/polars/io/csv/functions.py

+24-6
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ def read_csv(
5353
ignore_errors: bool = False,
5454
try_parse_dates: bool = False,
5555
n_threads: int | None = None,
56+
infer_schema: bool = True,
5657
infer_schema_length: int | None = N_INFER_DEFAULT,
5758
batch_size: int = 8192,
5859
n_rows: int | None = None,
@@ -126,7 +127,7 @@ def read_csv(
126127
Before using this option, try to increase the number of lines used for schema
127128
inference with e.g. `infer_schema_length=10000` or override automatic dtype
128129
inference for specific columns with the `schema_overrides` option or use
129-
`infer_schema_length=0` to read all columns as `pl.String` to check which
130+
`infer_schema=False` to read all columns as `pl.String` to check which
130131
values might cause an issue.
131132
try_parse_dates
132133
Try to automatically parse dates. Most ISO8601-like formats can
@@ -136,10 +137,15 @@ def read_csv(
136137
n_threads
137138
Number of threads to use in csv parsing.
138139
Defaults to the number of physical CPUs on your system.
140+
infer_schema
141+
When `True`, the schema is inferred from the data using the first
142+
`infer_schema_length` rows.
143+
When `False`, the schema is not inferred and each column is `pl.String`
144+
unless its dtype is given in `schema` or `schema_overrides`.
139145
infer_schema_length
140146
The maximum number of rows to scan for schema inference.
141-
If set to `0`, all columns will be read as `pl.String`.
142147
If set to `None`, the full data may be scanned *(this is slow)*.
148+
Set `infer_schema=False` to read all columns as `pl.String`.
143149
batch_size
144150
Number of lines to read into the buffer at once.
145151
Modify this to change performance.
@@ -184,7 +190,7 @@ def read_csv(
184190
with windows line endings (`\r\n`), one can go with the default `\n`. The extra
185191
`\r` will be removed when processed.
186192
raise_if_empty
187-
When there is no data in the source,`NoDataError` is raised. If this parameter
193+
When there is no data in the source, `NoDataError` is raised. If this parameter
188194
is set to False, an empty DataFrame (with no columns) is returned instead.
189195
truncate_ragged_lines
190196
Truncate lines that are longer than the schema.
@@ -410,6 +416,9 @@ def read_csv(
410416
for column_name, column_dtype in schema_overrides.items()
411417
}
412418

419+
if not infer_schema:
420+
infer_schema_length = 0
421+
413422
with prepare_file_arg(
414423
source,
415424
encoding=encoding,
@@ -922,6 +931,7 @@ def scan_csv(
922931
ignore_errors: bool = False,
923932
cache: bool = True,
924933
with_column_names: Callable[[list[str]], list[str]] | None = None,
934+
infer_schema: bool = True,
925935
infer_schema_length: int | None = N_INFER_DEFAULT,
926936
n_rows: int | None = None,
927937
encoding: CsvEncoding = "utf8",
@@ -989,17 +999,22 @@ def scan_csv(
989999
utf8 values to be treated as the empty string you can set this param True.
9901000
ignore_errors
9911001
Try to keep reading lines if some lines yield errors.
992-
First try `infer_schema_length=0` to read all columns as
1002+
First try `infer_schema=False` to read all columns as
9931003
`pl.String` to check which values might cause an issue.
9941004
cache
9951005
Cache the result after reading.
9961006
with_column_names
9971007
Apply a function over the column names just in time (when they are determined);
9981008
this function will receive (and should return) a list of column names.
1009+
infer_schema
1010+
When `True`, the schema is inferred from the data using the first
1011+
`infer_schema_length` rows.
1012+
When `False`, the schema is not inferred and each column is `pl.String`
1013+
unless its dtype is given in `schema` or `schema_overrides`.
9991014
infer_schema_length
10001015
The maximum number of rows to scan for schema inference.
1001-
If set to `0`, all columns will be read as `pl.String`.
10021016
If set to `None`, the full data may be scanned *(this is slow)*.
1017+
Set `infer_schema=False` to read all columns as `pl.String`.
10031018
n_rows
10041019
Stop reading from CSV file after reading `n_rows`.
10051020
encoding : {'utf8', 'utf8-lossy'}
@@ -1029,7 +1044,7 @@ def scan_csv(
10291044
scanning a headerless CSV file). If the given list is shorter than the width of
10301045
the DataFrame the remaining columns will have their original name.
10311046
raise_if_empty
1032-
When there is no data in the source,`NoDataError` is raised. If this parameter
1047+
When there is no data in the source, `NoDataError` is raised. If this parameter
10331048
is set to False, an empty LazyFrame (with no columns) is returned instead.
10341049
truncate_ragged_lines
10351050
Truncate lines that are longer than the schema.
@@ -1153,6 +1168,9 @@ def with_column_names(cols: list[str]) -> list[str]:
11531168
normalize_filepath(source, check_not_directory=False) for source in source
11541169
]
11551170

1171+
if not infer_schema:
1172+
infer_schema_length = 0
1173+
11561174
return _scan_csv_impl(
11571175
source,
11581176
has_header=has_header,

py-polars/tests/unit/io/test_csv.py

+13
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,19 @@ def test_normalize_filepath(io_files_path: Path) -> None:
113113
)
114114

115115

116+
def test_infer_schema_false() -> None:
    """Check that `read_csv(..., infer_schema=False)` returns only `pl.String` columns."""
    # Every value in the payload looks numeric; the assertion below requires
    # that all three columns are nevertheless reported as `pl.String`.
    payload = textwrap.dedent(
        """\
        a,b,c
        1,2,3
        1,2,3
        """
    )
    frame = pl.read_csv(io.StringIO(payload), infer_schema=False)
    expected_dtypes = [pl.String, pl.String, pl.String]
    assert frame.dtypes == expected_dtypes
127+
128+
116129
def test_csv_null_values() -> None:
117130
csv = textwrap.dedent(
118131
"""\

0 commit comments

Comments
 (0)