Skip to content

Commit

Permalink
feat: allow specifying schema in pl.scan_ndjson (#10963)
Browse files Browse the repository at this point in the history
Co-authored-by: ritchie <[email protected]>
  • Loading branch information
sd2k and ritchie46 authored Oct 13, 2023
1 parent f40eea6 commit 57139eb
Show file tree
Hide file tree
Showing 10 changed files with 68 additions and 11 deletions.
9 changes: 7 additions & 2 deletions crates/polars-lazy/src/physical_plan/executors/scan/ndjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,20 @@ impl AnonymousScan for LazyJsonLineReader {
.finish()
}

fn schema(&self, infer_schema_length: Option<usize>) -> PolarsResult<Schema> {
fn schema(&self, infer_schema_length: Option<usize>) -> PolarsResult<SchemaRef> {
// Short-circuit schema inference if the schema has been explicitly provided.
if let Some(schema) = &self.schema {
return Ok(schema.clone());
}

let f = polars_utils::open_file(&self.path)?;
let mut reader = std::io::BufReader::new(f);

let data_type =
polars_json::ndjson::infer(&mut reader, infer_schema_length).map_err(to_compute_err)?;
let schema = Schema::from_iter(StructArray::get_fields(&data_type));

Ok(schema)
Ok(Arc::new(schema))
}
fn allows_projection_pushdown(&self) -> bool {
true
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-lazy/src/scan/anonymous_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::prelude::*;
#[derive(Clone)]
pub struct ScanArgsAnonymous {
pub infer_schema_length: Option<usize>,
pub schema: Option<Schema>,
pub schema: Option<SchemaRef>,
pub skip_rows: Option<usize>,
pub n_rows: Option<usize>,
pub row_count: Option<RowCount>,
Expand Down
7 changes: 4 additions & 3 deletions crates/polars-lazy/src/scan/ndjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pub struct LazyJsonLineReader {
pub(crate) batch_size: Option<usize>,
pub(crate) low_memory: bool,
pub(crate) rechunk: bool,
pub(crate) schema: Option<Schema>,
pub(crate) schema: Option<SchemaRef>,
pub(crate) row_count: Option<RowCount>,
pub(crate) infer_schema_length: Option<usize>,
pub(crate) n_rows: Option<usize>,
Expand Down Expand Up @@ -52,6 +52,7 @@ impl LazyJsonLineReader {
}
/// Set the number of rows to use when inferring the json schema.
/// the default is 100 rows.
/// Ignored when the schema is specified explicitly using [`Self::with_schema`].
/// Setting to `None` will do a full table scan, very slow.
#[must_use]
pub fn with_infer_schema_length(mut self, num_rows: Option<usize>) -> Self {
Expand All @@ -60,8 +61,8 @@ impl LazyJsonLineReader {
}
/// Set the JSON file's schema
#[must_use]
pub fn with_schema(mut self, schema: Schema) -> Self {
self.schema = Some(schema);
pub fn with_schema(mut self, schema: Option<SchemaRef>) -> Self {
self.schema = schema;
self
}

Expand Down
15 changes: 15 additions & 0 deletions crates/polars-lazy/src/tests/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,21 @@ fn scan_predicate_on_set_null_values() -> PolarsResult<()> {
Ok(())
}

#[test]
fn scan_anonymous_fn() -> PolarsResult<()> {
    // Scan provider: each invocation simply materializes the fruits/cars fixture.
    let scan_fn = Arc::new(|_opts: AnonymousScanArgs| Ok(fruits_cars()));

    // Supply the schema up front so no schema-function/inference is needed.
    let args = ScanArgsAnonymous {
        schema: Some(Arc::new(fruits_cars().schema())),
        ..ScanArgsAnonymous::default()
    };

    let out = LazyFrame::anonymous_scan(scan_fn, args)?.collect()?;
    // The fixture frame is 5 rows by 4 columns.
    assert_eq!(out.shape(), (5, 4));
    Ok(())
}

#[test]
#[cfg(feature = "dtype-full")]
fn scan_small_dtypes() -> PolarsResult<()> {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-plan/src/logical_plan/anonymous_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pub trait AnonymousScan: Send + Sync {

/// function to supply the schema.
/// Allows for an optional infer schema argument for data sources with dynamic schemas
fn schema(&self, _infer_schema_length: Option<usize>) -> PolarsResult<Schema> {
fn schema(&self, _infer_schema_length: Option<usize>) -> PolarsResult<SchemaRef> {
polars_bail!(ComputeError: "must supply either a schema or a schema function");
}
/// specify if the scan provider should allow predicate pushdowns
Expand Down
6 changes: 3 additions & 3 deletions crates/polars-plan/src/logical_plan/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,16 @@ fn prepare_schema(mut schema: Schema, row_count: Option<&RowCount>) -> SchemaRef
impl LogicalPlanBuilder {
pub fn anonymous_scan(
function: Arc<dyn AnonymousScan>,
schema: Option<Schema>,
schema: Option<SchemaRef>,
infer_schema_length: Option<usize>,
skip_rows: Option<usize>,
n_rows: Option<usize>,
name: &'static str,
) -> PolarsResult<Self> {
let schema = Arc::new(match schema {
let schema = match schema {
Some(s) => s,
None => function.schema(infer_schema_length)?,
});
};

let file_info = FileInfo::new(schema.clone(), (n_rows, n_rows.unwrap_or(usize::MAX)));
let file_options = FileScanOptions {
Expand Down
12 changes: 12 additions & 0 deletions py-polars/polars/io/ndjson.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def scan_ndjson(
rechunk: bool = True,
row_count_name: str | None = None,
row_count_offset: int = 0,
schema: SchemaDefinition | None = None,
) -> LazyFrame:
"""
Lazily read from a newline delimited JSON file or multiple files via glob patterns.
Expand All @@ -91,11 +92,22 @@ def scan_ndjson(
DataFrame
row_count_offset
Offset to start the row_count column (only use if the name is set)
schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
The DataFrame schema may be declared in several ways:
* As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
* As a list of column names; in this case types are automatically inferred.
* As a list of (name,type) pairs; this is equivalent to the dictionary form.
If you supply a list of column names that does not match the names in the
underlying data, the names given here will overwrite them. The number
of names given in the schema should match the underlying data dimensions.
"""
return pl.LazyFrame._scan_ndjson(
source,
infer_schema_length=infer_schema_length,
schema=schema,
batch_size=batch_size,
n_rows=n_rows,
low_memory=low_memory,
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,7 @@ def _scan_ndjson(
source: str | Path | list[str] | list[Path],
*,
infer_schema_length: int | None = None,
schema: SchemaDefinition | None = None,
batch_size: int | None = None,
n_rows: int | None = None,
low_memory: bool = False,
Expand Down Expand Up @@ -547,6 +548,7 @@ def _scan_ndjson(
source,
sources,
infer_schema_length,
schema,
batch_size,
n_rows,
low_memory,
Expand Down
6 changes: 5 additions & 1 deletion py-polars/src/lazyframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,13 @@ impl PyLazyFrame {

#[staticmethod]
#[cfg(feature = "json")]
#[pyo3(signature = (path, paths, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_count))]
#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (path, paths, infer_schema_length, schema, batch_size, n_rows, low_memory, rechunk, row_count))]
fn new_from_ndjson(
path: Option<PathBuf>,
paths: Vec<PathBuf>,
infer_schema_length: Option<usize>,
schema: Option<Wrap<Schema>>,
batch_size: Option<usize>,
n_rows: Option<usize>,
low_memory: bool,
Expand All @@ -135,9 +137,11 @@ impl PyLazyFrame {
.with_n_rows(n_rows)
.low_memory(low_memory)
.with_rechunk(rechunk)
.with_schema(schema.map(|schema| Arc::new(schema.0)))
.with_row_count(row_count)
.finish()
.map_err(PyPolarsErr::from)?;

Ok(lf.into())
}

Expand Down
18 changes: 18 additions & 0 deletions py-polars/tests/unit/io/test_lazy_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,24 @@ def test_scan_ndjson(foods_ndjson_path: Path) -> None:
assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]


def test_scan_ndjson_with_schema(foods_ndjson_path: Path) -> None:
    # An explicitly provided schema must be honoured by scan_ndjson
    # instead of the inferred dtypes.
    schema = {
        "category": pl.Categorical,
        "calories": pl.Int64,
        "fats_g": pl.Float64,
        "sugars_g": pl.Int64,
    }
    df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()
    for name, dtype in schema.items():
        assert df[name].dtype == dtype

    # Overriding a single column's dtype must also take effect on re-scan.
    schema["sugars_g"] = pl.Float64
    df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()
    assert df["sugars_g"].dtype == pl.Float64


@pytest.mark.write_disk()
def test_scan_with_projection(tmp_path: Path) -> None:
tmp_path.mkdir(exist_ok=True)
Expand Down

0 comments on commit 57139eb

Please sign in to comment.