diff --git a/docs/source/src/python/user-guide/lazy/execution.py b/docs/source/src/python/user-guide/lazy/execution.py
index ffef5ba20e8a..d410c8a45371 100644
--- a/docs/source/src/python/user-guide/lazy/execution.py
+++ b/docs/source/src/python/user-guide/lazy/execution.py
@@ -28,4 +28,13 @@
     .collect(streaming=True)
 )
 # --8<-- [end:stream]
+# --8<-- [start:partial]
+q9 = (
+    pl.scan_csv("docs/assets/data/reddit.csv")
+    .head(10)
+    .with_columns(pl.col("name").str.to_uppercase())
+    .filter(pl.col("comment_karma") > 0)
+    .collect()
+)
+# --8<-- [end:partial]
 """
diff --git a/docs/source/user-guide/lazy/execution.md b/docs/source/user-guide/lazy/execution.md
index d5fcf1a89675..618926852f75 100644
--- a/docs/source/user-guide/lazy/execution.md
+++ b/docs/source/user-guide/lazy/execution.md
@@ -48,3 +48,22 @@ If your data requires more memory than you have available Polars may be able to
 {{code_block('user-guide/lazy/execution','stream',['scan_csv','collect'])}}
 
 We look at [streaming in more detail here](streaming.md).
+
+### Execution on a partial dataset
+
+While you're writing, optimizing, or checking your query on a large dataset, querying all available data may lead to a slow development process.
+
+You can instead limit the number of scanned partitions or use `.head` early in the query when testing. Keep in mind that aggregations and filters may behave unpredictably on subsets of data.
+
+{{code_block('user-guide/lazy/execution','partial',['scan_csv','collect','head'])}}
+
+```text
+shape: (1, 6)
+┌─────┬─────────────────────────┬─────────────┬────────────┬───────────────┬────────────┐
+│ id  ┆ name                    ┆ created_utc ┆ updated_on ┆ comment_karma ┆ link_karma │
+│ --- ┆ ---                     ┆ ---         ┆ ---        ┆ ---           ┆ ---        │
+│ i64 ┆ str                     ┆ i64         ┆ i64        ┆ i64           ┆ i64        │
+╞═════╪═════════════════════════╪═════════════╪════════════╪═══════════════╪════════════╡
+│ 6   ┆ TAOJIANLONG_JASONBROKEN ┆ 1397113510  ┆ 1536527864 ┆ 4             ┆ 0          │
+└─────┴─────────────────────────┴─────────────┴────────────┴───────────────┴────────────┘
+```