From b5404d5d00686b39a6e1bad1c5b51582ca6e7db8 Mon Sep 17 00:00:00 2001 From: Steven Van Ingelgem Date: Sat, 7 Oct 2023 16:05:07 +0200 Subject: [PATCH] Align types across all csv methods --- crates/polars-plan/src/logical_plan/builder.rs | 6 +++--- py-polars/polars/dataframe/frame.py | 10 +++++----- py-polars/polars/io/csv/batched_reader.py | 2 +- py-polars/polars/io/csv/functions.py | 6 +++--- py-polars/polars/lazyframe/frame.py | 10 +++++----- py-polars/src/dataframe.rs | 13 +++++-------- py-polars/src/lazyframe.rs | 14 +++++--------- 7 files changed, 27 insertions(+), 34 deletions(-) diff --git a/crates/polars-plan/src/logical_plan/builder.rs b/crates/polars-plan/src/logical_plan/builder.rs index 3c478a61e051..8837ce46c3f8 100644 --- a/crates/polars-plan/src/logical_plan/builder.rs +++ b/crates/polars-plan/src/logical_plan/builder.rs @@ -276,7 +276,7 @@ impl LogicalPlanBuilder { low_memory: bool, comment_char: Option, quote_char: Option, - line_terminator: u8, + eol_char: u8, null_values: Option, infer_schema_length: Option, rechunk: bool, @@ -322,7 +322,7 @@ impl LogicalPlanBuilder { skip_rows_after_header, comment_char, quote_char, - line_terminator, + eol_char, null_values.as_ref(), try_parse_dates, raise_if_empty, @@ -374,7 +374,7 @@ impl LogicalPlanBuilder { low_memory, comment_char, quote_char, - eol_char: line_terminator, + eol_char, null_values, encoding, try_parse_dates, diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 475aa1f09025..ab62bb81f188 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -658,7 +658,7 @@ def _read_csv( columns: Sequence[int] | Sequence[str] | None = None, delimiter_char: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None, schema: None | SchemaDict = None, @@ -776,7 +776,7 @@ def _read_csv( n_rows, skip_rows, projection, - delimiter_char, + ord(delimiter_char), rechunk, columns, encoding, @@ -785,15 +785,15 @@ def _read_csv( dtype_list, dtype_slice, low_memory, - comment_char, - quote_char, + ord(comment_char) if comment_char else None, + ord(quote_char) if quote_char else None, processed_null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, _prepare_row_count_args(row_count_name, row_count_offset), sample_size=sample_size, - eol_char=eol_char, + eol_char=ord(eol_char), raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, schema=schema, diff --git a/py-polars/polars/io/csv/batched_reader.py b/py-polars/polars/io/csv/batched_reader.py index 7eb522bb20f3..dfa523e7dcec 100644 --- a/py-polars/polars/io/csv/batched_reader.py +++ b/py-polars/polars/io/csv/batched_reader.py @@ -33,7 +33,7 @@ def __init__( columns: Sequence[int] | Sequence[str] | None = None, delimiter_char: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: None | (SchemaDict | Sequence[PolarsDataType]) = None, null_values: str | Sequence[str] | dict[str, str] | None = None, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 3148f617d784..045e8053c307 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -25,7 +25,7 @@ def read_csv( new_columns: Sequence[str] | None = None, delimiter_char: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None, schema: SchemaDict | None = None, @@ -406,7 +406,7 @@ def read_csv_batched( new_columns: Sequence[str] | None = None, delimiter_char: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None = None, null_values: str | Sequence[str] | dict[str, str] | None = None, @@ -700,7 +700,7 @@ def scan_csv( has_header: bool = True, delimiter_char: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: SchemaDict | Sequence[PolarsDataType] | None = None, schema: SchemaDict | None = None, diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index f100630adacf..5778ee19c9d5 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -317,7 +317,7 @@ def _scan_csv( has_header: bool = True, delimiter_char: str = ",", comment_char: str | None = None, - quote_char: str | None = r'"', + quote_char: str | None = '"', skip_rows: int = 0, dtypes: SchemaDict | None = None, schema: SchemaDict | None = None, @@ -359,7 +359,7 @@ def _scan_csv( self = cls.__new__(cls) self._ldf = PyLazyFrame.new_from_csv( source, - delimiter_char, + ord(delimiter_char), has_header, ignore_errors, skip_rows, @@ -367,8 +367,8 @@ def _scan_csv( cache, dtype_list, low_memory, - comment_char, - quote_char, + ord(comment_char) if comment_char else None, + ord(quote_char) if quote_char else None, processed_null_values, missing_utf8_is_empty_string, infer_schema_length, @@ -378,7 +378,7 @@ def _scan_csv( encoding, _prepare_row_count_args(row_count_name, row_count_offset), try_parse_dates, - eol_char=eol_char, + eol_char=ord(eol_char), raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, schema=schema, diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index a39d3d654ffc..88bd8744d468 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -179,7 +179,7 @@ impl PyDataFrame { n_rows: Option, skip_rows: usize, projection: Option>, - delimiter_char: &str, + delimiter_char: u8, rechunk: bool, columns: Option>, encoding: Wrap, @@ -188,24 +188,21 @@ impl PyDataFrame { overwrite_dtype: Option)>>, overwrite_dtype_slice: Option>>, low_memory: bool, - comment_char: Option<&str>, - quote_char: Option<&str>, + comment_char: Option, + quote_char: Option, null_values: Option>, missing_utf8_is_empty_string: bool, try_parse_dates: bool, skip_rows_after_header: usize, row_count: Option<(String, IdxSize)>, sample_size: usize, - eol_char: &str, + eol_char: u8, raise_if_empty: bool, truncate_ragged_lines: bool, schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); - let comment_char = comment_char.map(|s| s.as_bytes()[0]); - let eol_char = eol_char.as_bytes()[0]; let row_count = row_count.map(|(name, offset)| RowCount { name, offset }); - let quote_char = quote_char.and_then(|s| s.as_bytes().first().copied()); let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| { overwrite_dtype @@ -229,7 +226,7 @@ impl PyDataFrame { .infer_schema(infer_schema_length) .has_header(has_header) .with_n_rows(n_rows) - .with_delimiter(delimiter_char.as_bytes()[0]) + .with_delimiter(delimiter_char) .with_skip_rows(skip_rows) .with_ignore_errors(ignore_errors) .with_projection(projection) diff --git a/py-polars/src/lazyframe.rs b/py-polars/src/lazyframe.rs index c353bd07f688..8873bcc66f21 100644 --- a/py-polars/src/lazyframe.rs +++ b/py-polars/src/lazyframe.rs @@ -146,7 +146,7 @@ impl PyLazyFrame { )] fn new_from_csv( path: String, - delimiter_char: &str, + delimiter_char: u8, has_header: bool, ignore_errors: bool, skip_rows: usize, @@ -154,8 +154,8 @@ impl PyLazyFrame { cache: bool, overwrite_dtype: Option)>>, low_memory: bool, - comment_char: Option<&str>, - quote_char: Option<&str>, + comment_char: Option, + quote_char: Option, null_values: Option>, missing_utf8_is_empty_string: bool, infer_schema_length: Option, @@ -165,16 +165,12 @@ impl PyLazyFrame { encoding: Wrap, row_count: Option<(String, IdxSize)>, try_parse_dates: bool, - eol_char: &str, + eol_char: u8, raise_if_empty: bool, truncate_ragged_lines: bool, schema: Option>, ) -> PyResult { let null_values = null_values.map(|w| w.0); - let comment_char = comment_char.map(|s| s.as_bytes()[0]); - let quote_char = quote_char.map(|s| s.as_bytes()[0]); - let delimiter = delimiter_char.as_bytes()[0]; - let eol_char = eol_char.as_bytes()[0]; let row_count = row_count.map(|(name, offset)| RowCount { name, offset }); let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| { @@ -185,7 +181,7 @@ impl PyLazyFrame { }); let mut r = LazyCsvReader::new(path) .with_infer_schema_length(infer_schema_length) - .with_delimiter(delimiter) + .with_delimiter(delimiter_char) .has_header(has_header) .with_ignore_errors(ignore_errors) .with_skip_rows(skip_rows)