From 77b0bf953402ab945e5e07f91ef5716d9e757878 Mon Sep 17 00:00:00 2001
From: Alex-Monahan
Date: Tue, 27 Jun 2023 08:47:08 -0700
Subject: [PATCH] Standardize on read_parquet

---
 docs/data/partitioning/hive_partitioning.md |  6 ++++--
 docs/extensions/httpfs.md                   | 15 +++++++++------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/docs/data/partitioning/hive_partitioning.md b/docs/data/partitioning/hive_partitioning.md
index bd63a113768..90617f4039a 100644
--- a/docs/data/partitioning/hive_partitioning.md
+++ b/docs/data/partitioning/hive_partitioning.md
@@ -7,6 +7,8 @@ title: Hive Partitioning
 
 ```sql
 -- read data from a hive partitioned data set
+SELECT * FROM read_parquet('orders/*/*/*.parquet', hive_partitioning=1);
+-- parquet_scan is an alias of read_parquet, so they are equivalent
 SELECT * FROM parquet_scan('orders/*/*/*.parquet', hive_partitioning=1);
 -- write a table to a hive partitioned data set
 COPY orders TO 'orders' (FORMAT PARQUET, PARTITION_BY (year, month));
@@ -36,7 +38,7 @@ orders
 Files stored in this hierarchy can be read using the `hive_partitioning` flag.
 
 ```sql
-SELECT * FROM parquet_scan('orders/*/*/*.parquet', hive_partitioning=1);
+SELECT * FROM read_parquet('orders/*/*/*.parquet', hive_partitioning=1);
 ```
 
 When we specify the `hive_partitioning` flag, the values of the columns will be read from the directories.
@@ -46,7 +48,7 @@ Filters on the partition keys are automatically pushed down into the files. This
 
 ```sql
 SELECT *
-FROM parquet_scan('orders/*/*/*.parquet', hive_partitioning=1)
+FROM read_parquet('orders/*/*/*.parquet', hive_partitioning=1)
 WHERE year=2022 AND month=11;
 ```
 
diff --git a/docs/extensions/httpfs.md b/docs/extensions/httpfs.md
index 60d22bc6246..ed8fdf982ea 100644
--- a/docs/extensions/httpfs.md
+++ b/docs/extensions/httpfs.md
@@ -37,6 +37,9 @@ SELECT COUNT(*) FROM 'https://domain.tld/file.parquet';
 Scanning multiple files over HTTP(S) is also supported:
 
 ```sql
+SELECT * FROM read_parquet(['https://domain.tld/file1.parquet', 'https://domain.tld/file2.parquet']);
+
+-- parquet_scan is an alias of read_parquet, so they are equivalent
 SELECT * FROM parquet_scan(['https://domain.tld/file1.parquet', 'https://domain.tld/file2.parquet']);
 ```
 
@@ -129,7 +132,7 @@ SELECT * FROM 's3://bucket/file.extension';
 Multiple files are also possible, for example:
 
 ```sql
-SELECT * FROM parquet_scan(['s3://bucket/file1.parquet', 's3://bucket/file2.parquet']);
+SELECT * FROM read_parquet(['s3://bucket/file1.parquet', 's3://bucket/file2.parquet']);
 ```
 
 ### Glob
@@ -138,7 +141,7 @@ File globbing is implemented using the ListObjectV2 API call and allows to use f
 multiple files, for example:
 
 ```sql
-SELECT * from parquet_scan('s3://bucket/*.parquet')
+SELECT * FROM read_parquet('s3://bucket/*.parquet');
 ```
 
 This query matches all files in the root of the bucket with the parquet extension.
@@ -147,13 +150,13 @@ Several features for matching are supported, such as `*` to match any number of
 character or `[0-9]` for a single character in a range of characters:
 
 ```sql
-SELECT COUNT(*) FROM parquet_scan('s3://bucket/folder*/100?/t[0-9].parquet')
+SELECT COUNT(*) FROM read_parquet('s3://bucket/folder*/100?/t[0-9].parquet');
 ```
 
 A useful feature when using globs is the `filename` option which adds a column with the file that a row originated from:
 
 ```sql
-SELECT * FROM parquet_scan('s3://bucket/*.parquet', FILENAME = 1);
+SELECT * FROM read_parquet('s3://bucket/*.parquet', FILENAME = 1);
 ```
 
 could for example result in:
@@ -178,7 +181,7 @@ s3://bucket/year=2014/file.parquet
 If scanning these files with the HIVE_PARTITIONING option enabled:
 
 ```sql
-SELECT * FROM parquet_scan('s3://bucket/*/file.parquet', HIVE_PARTITIONING = 1);
+SELECT * FROM read_parquet('s3://bucket/*/file.parquet', HIVE_PARTITIONING = 1);
 ```
 
 could result in:
@@ -194,7 +197,7 @@ however, these columns behave just like regular columns. For example, filters ca
 columns:
 
 ```sql
-SELECT * FROM parquet_scan('s3://bucket/*/file.parquet', HIVE_PARTITIONING = 1) where year=2013;
+SELECT * FROM read_parquet('s3://bucket/*/file.parquet', HIVE_PARTITIONING = 1) WHERE year=2013;
 ```
 
 ## Writing
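For anyone reviewing the rename, here is a minimal local round trip stitched together from statements that already appear in the patched docs; the throwaway `orders` table, its columns, and the output path are illustrative placeholders, not part of the patch:

```sql
-- create a tiny stand-in table (name and columns are placeholders)
CREATE TABLE orders AS
    SELECT 2022 AS year, 11 AS month, 42 AS amount;

-- write it out hive-partitioned, as in hive_partitioning.md
COPY orders TO 'orders' (FORMAT PARQUET, PARTITION_BY (year, month));

-- read it back with the standardized function name
SELECT * FROM read_parquet('orders/*/*/*.parquet', hive_partitioning=1);

-- the old name is an alias, so this should return the same rows
SELECT * FROM parquet_scan('orders/*/*/*.parquet', hive_partitioning=1);
```

Because `parquet_scan` is only an alias of `read_parquet`, the last two queries should produce identical results, which is why the docs can standardize on one name without changing any described behavior.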