From 329c1ef4c9de99f5992bdc7adcb5ca5562316bf2 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Mon, 28 Oct 2024 15:56:56 +1100 Subject: [PATCH] c --- .../src/python/user-guide/io/cloud-storage.py | 50 ++++++++++++++++--- .../src/rust/user-guide/io/cloud-storage.rs | 12 +++-- docs/source/user-guide/io/cloud-storage.md | 32 +++++++++--- 3 files changed, 75 insertions(+), 19 deletions(-) diff --git a/docs/source/src/python/user-guide/io/cloud-storage.py b/docs/source/src/python/user-guide/io/cloud-storage.py index 73cf597ec84e..12b02df28e61 100644 --- a/docs/source/src/python/user-guide/io/cloud-storage.py +++ b/docs/source/src/python/user-guide/io/cloud-storage.py @@ -7,7 +7,16 @@ df = pl.read_parquet(source) # --8<-- [end:read_parquet] -# --8<-- [start:scan_parquet] +# --8<-- [start:scan_parquet_query] +import polars as pl + +source = "s3://bucket/*.parquet" + +df = pl.scan_parquet(source).filter(pl.col("id") < 100).select("id","value").collect() +# --8<-- [end:scan_parquet_query] + + +# --8<-- [start:scan_parquet_storage_options_aws] import polars as pl source = "s3://bucket/*.parquet" @@ -17,17 +26,42 @@ "aws_secret_access_key": "", "aws_region": "us-east-1", } -df = pl.scan_parquet(source, storage_options=storage_options) -# --8<-- [end:scan_parquet] +df = pl.scan_parquet(source, storage_options=storage_options).collect() +# --8<-- [end:scan_parquet_storage_options_aws] + +# --8<-- [start:credential_provider_class] +lf = pl.scan_parquet( + "s3://.../...", + credential_provider=pl.CredentialProviderAWS( + profile_name="...",
+ assume_role={ + "RoleArn": "...", + "RoleSessionName": "...", + } + ), +) -# --8<-- [start:scan_parquet_query] -import polars as pl +df = lf.collect() +# --8<-- [end:credential_provider_class] -source = "s3://bucket/*.parquet" +# --8<-- [start:credential_provider_custom_func] +def get_credentials() -> pl.CredentialProviderFunctionReturn: + expiry = None + return { + "aws_access_key_id": "...", + "aws_secret_access_key": "...", + "aws_session_token": "...", + }, expiry -df = pl.scan_parquet(source).filter(pl.col("id") < 100).select("id","value").collect() -# --8<-- [end:scan_parquet_query] + +lf = pl.scan_parquet( + "s3://.../...", + credential_provider=get_credentials, +) + +df = lf.collect() +# --8<-- [end:credential_provider_custom_func] # --8<-- [start:scan_pyarrow_dataset] import polars as pl diff --git a/docs/source/src/rust/user-guide/io/cloud-storage.rs b/docs/source/src/rust/user-guide/io/cloud-storage.rs index 5c297739eeee..2c40c31ff2ee 100644 --- a/docs/source/src/rust/user-guide/io/cloud-storage.rs +++ b/docs/source/src/rust/user-guide/io/cloud-storage.rs @@ -31,12 +31,18 @@ async fn main() { } // --8<-- [end:read_parquet] -// --8<-- [start:scan_parquet] -// --8<-- [end:scan_parquet] - // --8<-- [start:scan_parquet_query] // --8<-- [end:scan_parquet_query] +// --8<-- [start:scan_parquet_storage_options_aws] +// --8<-- [end:scan_parquet_storage_options_aws] + +// --8<-- [start:credential_provider_class] +// --8<-- [end:credential_provider_class] + +// --8<-- [start:credential_provider_custom_func] +// --8<-- [end:credential_provider_custom_func] + // --8<-- [start:scan_pyarrow_dataset] // --8<-- [end:scan_pyarrow_dataset] diff --git a/docs/source/user-guide/io/cloud-storage.md b/docs/source/user-guide/io/cloud-storage.md index ba686a5a0f11..f3b5d7a8fb09 100644 --- a/docs/source/user-guide/io/cloud-storage.md +++ b/docs/source/user-guide/io/cloud-storage.md @@ -18,23 +18,39 @@ To read from cloud storage, additional dependencies may be needed depending 
on t ## Reading from cloud storage -Polars can read a CSV, IPC or Parquet file in eager mode from cloud storage. +Polars supports reading Parquet, CSV, IPC and NDJSON files from cloud storage: {{code_block('user-guide/io/cloud-storage','read_parquet',['read_parquet','read_csv','read_ipc'])}} -This eager query downloads the file to a buffer in memory and creates a `DataFrame` from there. Polars uses `fsspec` to manage this download internally for all cloud storage providers. - ## Scanning from cloud storage with query optimisation -Polars can scan a Parquet file in lazy mode from cloud storage. We may need to provide further details beyond the source url such as authentication details or storage region. Polars looks for these as environment variables but we can also do this manually by passing a `dict` as the `storage_options` argument. +Using `pl.scan_*` functions to read from cloud storage can benefit from [predicate and projection pushdowns](../lazy/optimizations.md), where the query optimizer will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. -{{code_block('user-guide/io/cloud-storage','scan_parquet',['scan_parquet'])}} +{{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} -This query creates a `LazyFrame` without downloading the file. In the `LazyFrame` we have access to file metadata such as the schema. Polars uses the `object_store.rs` library internally to manage the interface with the cloud storage providers and so no extra dependencies are required in Python to scan a cloud Parquet file. +## Cloud authentication -If we create a lazy query with [predicate and projection pushdowns](../lazy/optimizations.md), the query optimizer will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. 
+Polars is able to automatically load default credential configurations for some cloud providers. For +cases when this does not happen, it is possible to manually configure the credentials for Polars to +use for authentication. This can be done in a few ways: -{{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} +### Using `storage_options` + +- Credentials can be passed as configuration keys in a dict with the `storage_options` parameter: + +{{code_block('user-guide/io/cloud-storage','scan_parquet_storage_options_aws',['scan_parquet'])}} + +### Using one of the available `CredentialProvider*` utility classes + +- There may be a utility class `pl.CredentialProvider*` that provides the required authentication functionality. For example, `pl.CredentialProviderAWS` supports selecting AWS profiles, as well as assuming an IAM role: + +{{code_block('user-guide/io/cloud-storage','credential_provider_class',['scan_parquet'])}} + +### Using a custom `credential_provider` function + +- Some environments may require custom authentication logic (e.g. AWS IAM role-chaining). For these cases a Python function can be provided for Polars to use to retrieve credentials: + +{{code_block('user-guide/io/cloud-storage','credential_provider_custom_func',['scan_parquet'])}} ## Scanning with PyArrow