Skip to content

Commit

Permalink
feat: allow specifying schema in pl.scan_ndjson (#10963)
Browse files Browse the repository at this point in the history
Co-authored-by: ritchie <[email protected]>
  • Loading branch information
sd2k and ritchie46 authored Oct 13, 2023
1 parent f40eea6 commit 57139eb
Show file tree
Hide file tree
Showing 10 changed files with 68 additions and 11 deletions.
9 changes: 7 additions & 2 deletions crates/polars-lazy/src/physical_plan/executors/scan/ndjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,20 @@ impl AnonymousScan for LazyJsonLineReader {
.finish()
}

fn schema(&self, infer_schema_length: Option<usize>) -> PolarsResult<Schema> {
fn schema(&self, infer_schema_length: Option<usize>) -> PolarsResult<SchemaRef> {
// Short-circuit schema inference if the schema has been explicitly provided.
if let Some(schema) = &self.schema {
return Ok(schema.clone());
}

let f = polars_utils::open_file(&self.path)?;
let mut reader = std::io::BufReader::new(f);

let data_type =
polars_json::ndjson::infer(&mut reader, infer_schema_length).map_err(to_compute_err)?;
let schema = Schema::from_iter(StructArray::get_fields(&data_type));

Ok(schema)
Ok(Arc::new(schema))
}
fn allows_projection_pushdown(&self) -> bool {
true
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-lazy/src/scan/anonymous_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::prelude::*;
#[derive(Clone)]
pub struct ScanArgsAnonymous {
pub infer_schema_length: Option<usize>,
pub schema: Option<Schema>,
pub schema: Option<SchemaRef>,
pub skip_rows: Option<usize>,
pub n_rows: Option<usize>,
pub row_count: Option<RowCount>,
Expand Down
7 changes: 4 additions & 3 deletions crates/polars-lazy/src/scan/ndjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pub struct LazyJsonLineReader {
pub(crate) batch_size: Option<usize>,
pub(crate) low_memory: bool,
pub(crate) rechunk: bool,
pub(crate) schema: Option<Schema>,
pub(crate) schema: Option<SchemaRef>,
pub(crate) row_count: Option<RowCount>,
pub(crate) infer_schema_length: Option<usize>,
pub(crate) n_rows: Option<usize>,
Expand Down Expand Up @@ -52,6 +52,7 @@ impl LazyJsonLineReader {
}
/// Set the number of rows to use when inferring the json schema.
/// the default is 100 rows.
/// Ignored when the schema is specified explicitly using [`Self::with_schema`].
/// Setting to `None` will do a full table scan, very slow.
#[must_use]
pub fn with_infer_schema_length(mut self, num_rows: Option<usize>) -> Self {
Expand All @@ -60,8 +61,8 @@ impl LazyJsonLineReader {
}
/// Set the JSON file's schema
#[must_use]
pub fn with_schema(mut self, schema: Schema) -> Self {
self.schema = Some(schema);
pub fn with_schema(mut self, schema: Option<SchemaRef>) -> Self {
self.schema = schema;
self
}

Expand Down
15 changes: 15 additions & 0 deletions crates/polars-lazy/src/tests/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,21 @@ fn scan_predicate_on_set_null_values() -> PolarsResult<()> {
Ok(())
}

#[test]
fn scan_anonymous_fn() -> PolarsResult<()> {
    // Scan provider: each invocation simply materializes the fruits/cars fixture.
    let scan_fn = Arc::new(|_opts: AnonymousScanArgs| Ok(fruits_cars()));

    // Supply the schema up front so no schema-function/inference is needed.
    let args = ScanArgsAnonymous {
        schema: Some(Arc::new(fruits_cars().schema())),
        ..ScanArgsAnonymous::default()
    };

    let out = LazyFrame::anonymous_scan(scan_fn, args)?.collect()?;
    // The fixture frame is 5 rows by 4 columns.
    assert_eq!(out.shape(), (5, 4));
    Ok(())
}

#[test]
#[cfg(feature = "dtype-full")]
fn scan_small_dtypes() -> PolarsResult<()> {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-plan/src/logical_plan/anonymous_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pub trait AnonymousScan: Send + Sync {

/// function to supply the schema.
/// Allows for an optional infer schema argument for data sources with dynamic schemas
fn schema(&self, _infer_schema_length: Option<usize>) -> PolarsResult<Schema> {
fn schema(&self, _infer_schema_length: Option<usize>) -> PolarsResult<SchemaRef> {
polars_bail!(ComputeError: "must supply either a schema or a schema function");
}
/// specify if the scan provider should allow predicate pushdowns
Expand Down
6 changes: 3 additions & 3 deletions crates/polars-plan/src/logical_plan/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,16 @@ fn prepare_schema(mut schema: Schema, row_count: Option<&RowCount>) -> SchemaRef
impl LogicalPlanBuilder {
pub fn anonymous_scan(
function: Arc<dyn AnonymousScan>,
schema: Option<Schema>,
schema: Option<SchemaRef>,
infer_schema_length: Option<usize>,
skip_rows: Option<usize>,
n_rows: Option<usize>,
name: &'static str,
) -> PolarsResult<Self> {
let schema = Arc::new(match schema {
let schema = match schema {
Some(s) => s,
None => function.schema(infer_schema_length)?,
});
};

let file_info = FileInfo::new(schema.clone(), (n_rows, n_rows.unwrap_or(usize::MAX)));
let file_options = FileScanOptions {
Expand Down
12 changes: 12 additions & 0 deletions py-polars/polars/io/ndjson.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def scan_ndjson(
rechunk: bool = True,
row_count_name: str | None = None,
row_count_offset: int = 0,
schema: SchemaDefinition | None = None,
) -> LazyFrame:
"""
Lazily read from a newline delimited JSON file or multiple files via glob patterns.
Expand All @@ -91,11 +92,22 @@ def scan_ndjson(
DataFrame
row_count_offset
Offset to start the row_count column (only use if the name is set)
schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
The DataFrame schema may be declared in several ways:
* As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
* As a list of column names; in this case types are automatically inferred.
* As a list of (name,type) pairs; this is equivalent to the dictionary form.
If you supply a list of column names that does not match the names in the
underlying data, the names given here will overwrite them. The number
of names given in the schema should match the underlying data dimensions.
"""
return pl.LazyFrame._scan_ndjson(
source,
infer_schema_length=infer_schema_length,
schema=schema,
batch_size=batch_size,
n_rows=n_rows,
low_memory=low_memory,
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,7 @@ def _scan_ndjson(
source: str | Path | list[str] | list[Path],
*,
infer_schema_length: int | None = None,
schema: SchemaDefinition | None = None,
batch_size: int | None = None,
n_rows: int | None = None,
low_memory: bool = False,
Expand Down Expand Up @@ -547,6 +548,7 @@ def _scan_ndjson(
source,
sources,
infer_schema_length,
schema,
batch_size,
n_rows,
low_memory,
Expand Down
6 changes: 5 additions & 1 deletion py-polars/src/lazyframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,13 @@ impl PyLazyFrame {

#[staticmethod]
#[cfg(feature = "json")]
#[pyo3(signature = (path, paths, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_count))]
#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (path, paths, infer_schema_length, schema, batch_size, n_rows, low_memory, rechunk, row_count))]
fn new_from_ndjson(
path: Option<PathBuf>,
paths: Vec<PathBuf>,
infer_schema_length: Option<usize>,
schema: Option<Wrap<Schema>>,
batch_size: Option<usize>,
n_rows: Option<usize>,
low_memory: bool,
Expand All @@ -135,9 +137,11 @@ impl PyLazyFrame {
.with_n_rows(n_rows)
.low_memory(low_memory)
.with_rechunk(rechunk)
.with_schema(schema.map(|schema| Arc::new(schema.0)))
.with_row_count(row_count)
.finish()
.map_err(PyPolarsErr::from)?;

Ok(lf.into())
}

Expand Down
18 changes: 18 additions & 0 deletions py-polars/tests/unit/io/test_lazy_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,24 @@ def test_scan_ndjson(foods_ndjson_path: Path) -> None:
assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]


def test_scan_ndjson_with_schema(foods_ndjson_path: Path) -> None:
    # An explicitly provided schema must be honoured by scan_ndjson
    # instead of the inferred dtypes.
    schema = {
        "category": pl.Categorical,
        "calories": pl.Int64,
        "fats_g": pl.Float64,
        "sugars_g": pl.Int64,
    }
    df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()
    for name, dtype in schema.items():
        assert df[name].dtype == dtype

    # Overriding a single column's dtype must also take effect on re-scan.
    schema["sugars_g"] = pl.Float64
    df = pl.scan_ndjson(foods_ndjson_path, schema=schema).collect()
    assert df["sugars_g"].dtype == pl.Float64


@pytest.mark.write_disk()
def test_scan_with_projection(tmp_path: Path) -> None:
tmp_path.mkdir(exist_ok=True)
Expand Down

0 comments on commit 57139eb

Please sign in to comment.