From 444a3a61e84c4e2646827d078bd9ff0e72895aef Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 22 Jul 2022 14:30:34 -0400 Subject: [PATCH 001/397] Donate `object_store` code from object_store_rs to arrow-rs (#2081) * Import https://github.com/influxdata/object_store_rs/commit/3c51870ac41a90942c2e45bb499a893d514ed1da * Add object_store to workspace, update notes and readme * Remove old github items * Remove old gitignore * Remove kodiak config * Remove redundant license files * Remove influx specific security policy * Remove redudant rust-toolchain and rustfmt * Add Apache License (RAT) * ignore bubble_up_io_errors test * Fix list_store with explicit lifetime, only run `test_list_root` on linux * Only run object_store throttle tests on a mac --- .circleci/config.yml | 262 +++++++++++ CONTRIBUTING.md | 94 ++++ Cargo.toml | 79 ++++ README.md | 26 ++ deny.toml | 45 ++ src/aws.rs | 1041 ++++++++++++++++++++++++++++++++++++++++++ src/azure.rs | 646 ++++++++++++++++++++++++++ src/gcp.rs | 721 +++++++++++++++++++++++++++++ src/lib.rs | 706 ++++++++++++++++++++++++++++ src/local.rs | 773 +++++++++++++++++++++++++++++++ src/memory.rs | 297 ++++++++++++ src/oauth.rs | 215 +++++++++ src/path/mod.rs | 531 +++++++++++++++++++++ src/path/parts.rs | 148 ++++++ src/throttle.rs | 540 ++++++++++++++++++++++ src/token.rs | 64 +++ src/util.rs | 73 +++ 17 files changed, 6261 insertions(+) create mode 100644 .circleci/config.yml create mode 100644 CONTRIBUTING.md create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 deny.toml create mode 100644 src/aws.rs create mode 100644 src/azure.rs create mode 100644 src/gcp.rs create mode 100644 src/lib.rs create mode 100644 src/local.rs create mode 100644 src/memory.rs create mode 100644 src/oauth.rs create mode 100644 src/path/mod.rs create mode 100644 src/path/parts.rs create mode 100644 src/throttle.rs create mode 100644 src/token.rs create mode 100644 src/util.rs diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..b4dff6d --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,262 @@ +--- +# CI Overview +# ----------- +# +# Each night: +# +# A build image is created (ci_image) from `docker/Dockerfile.ci` and is +# pushed to `quay.io/influxdb/rust:ci`. This build image is then used to run +# the CI tasks for the day. +# +# Every commit: +# +# The CI for every PR and merge to main runs tests, fmt, lints and compiles debug binaries +# +# On main if all these checks pass it will then additionally compile in "release" mode and +# publish a docker image to quay.io/influxdb/iox:$COMMIT_SHA +# +# Manual CI Image: +# +# It is possible to manually trigger a rebuild of the image used in CI. To do this, navigate to +# https://app.circleci.com/pipelines/github/influxdata/influxdb_iox?branch=main (overriding the +# branch name if desired). 
Then: +# - Click "Run Pipeline" in the top-right +# - Expand "Add Parameters" +# - Add a "boolean" parameter called "ci_image" with the value true +# - Click "Run Pipeline" +# +# If you refresh the page you should see a newly running ci_image workflow +# + +version: 2.1 + +orbs: + win: circleci/windows@4.1 + +commands: + rust_components: + description: Verify installed components + steps: + - run: + name: Verify installed components + command: | + rustup --version + rustup show + cargo fmt --version + cargo clippy --version + + cache_restore: + description: Restore Cargo Cache + steps: + - restore_cache: + name: Restoring Cargo Cache + keys: + - cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} + - cargo-cache-{{ arch }}-{{ .Branch }} + - cargo-cache + cache_save: + description: Save Cargo Cache + steps: + - save_cache: + name: Save Cargo Cache + paths: + - /usr/local/cargo/registry + key: cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} + +jobs: + fmt: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. + RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Rust fmt + command: cargo fmt --all -- --check + - cache_save + lint: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. + RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Clippy + command: cargo clippy --all-targets --all-features --workspace -- -D warnings + - cache_save + cargo_audit: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. + RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Install cargo-deny + command: cargo install --force cargo-deny + - run: + name: cargo-deny Checks + command: cargo deny check -s + - cache_save + check: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. 
+ RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Install cargo-hack + command: cargo install cargo-hack + - run: + name: Check all features + command: cargo hack check --feature-powerset --no-dev-deps --workspace + - cache_save + doc: + docker: + - image: quay.io/influxdb/rust:ci + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. + RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - rust_components + - cache_restore + - run: + name: Cargo doc + # excluding datafusion because it's effectively a dependency masqueraded as workspace crate. + command: cargo doc --document-private-items --no-deps --workspace --exclude datafusion + - cache_save + - run: + name: Compress Docs + command: tar -cvzf rustdoc.tar.gz target/doc/ + - store_artifacts: + path: rustdoc.tar.gz + test: + # setup multiple docker images (see https://circleci.com/docs/2.0/configuration-reference/#docker) + docker: + - image: quay.io/influxdb/rust:ci + - image: localstack/localstack:0.14.4 + - image: mcr.microsoft.com/azure-storage/azurite + - image: fsouza/fake-gcs-server + command: + - "-scheme" + - "http" + resource_class: 2xlarge # use of a smaller executor tends crashes on link + environment: + # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. + CARGO_INCREMENTAL: "0" + # Disable full debug symbol generation to speed up CI build + # "1" means line tables only, which is useful for panic tracebacks. 
+ RUSTFLAGS: "-C debuginfo=1" + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + RUST_BACKTRACE: "1" + # Run integration tests + TEST_INTEGRATION: 1 + AWS_DEFAULT_REGION: "us-east-1" + AWS_ACCESS_KEY_ID: test + AWS_SECRET_ACCESS_KEY: test + AWS_ENDPOINT: http://127.0.0.1:4566 + AZURE_USE_EMULATOR: "1" + GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" + OBJECT_STORE_BUCKET: test-bucket + steps: + - run: + name: Setup localstack (AWS emulation) + command: | + cd /tmp + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install + aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket + - run: + name: Setup Azurite (Azure emulation) + # the magical connection string is from https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings + command: | + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' + - run: + name: Setup fake GCS server + command: | + curl -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" + echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" + - checkout + - rust_components + - cache_restore + - run: + name: Cargo test + command: cargo test --workspace --features=aws,azure,azure_test,gcp + - cache_save + + test_windows: + executor: + name: win/default + size: medium + environment: + # https://github.com/rust-lang/cargo/issues/10280 + CARGO_NET_GIT_FETCH_WITH_CLI: "true" + steps: + - checkout + - run: + name: Download rustup + command: wget https://win.rustup.rs/x86_64 -O rustup-init.exe + - run: + name: Install rustup + command: .\rustup-init.exe -y --default-host=x86_64-pc-windows-msvc + - run: + name: Cargo test + command: cargo test --workspace + +workflows: + version: 2 + + # CI for all pull requests. + ci: + jobs: + - check + - fmt + - lint + - cargo_audit + - test + - test_windows + - doc diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..2e216dd --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,94 @@ + + +# Development instructions + +## Running Tests + +Tests can be run using `cargo` + +```shell +cargo test +``` + +## Running Integration Tests + +By default, integration tests are not run. 
To run them you will need to set `TEST_INTEGRATION=1` and then provide the +necessary configuration for that object store + +### AWS + +To test the S3 integration against [localstack](https://localstack.cloud/) + +First start up a container running localstack + +``` +$ podman run --rm -it -p 4566:4566 -p 4510-4559:4510-4559 localstack/localstack +``` + +Setup environment + +``` +export TEST_INTEGRATION=1 +export AWS_DEFAULT_REGION=us-east-1 +export AWS_ACCESS_KEY_ID=test +export AWS_SECRET_ACCESS_KEY=test +export AWS_ENDPOINT=http://127.0.0.1:4566 +export OBJECT_STORE_BUCKET=test-bucket +``` + +Create a bucket using the AWS CLI + +``` +podman run --net=host --env-host amazon/aws-cli --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket +``` + +Run tests + +``` +$ cargo test --features aws +``` + +### Azure + +To test the Azure integration +against [azurite](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio) + +Startup azurite + +``` +$ podman run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azure-storage/azurite +``` + +Create a bucket + +``` +$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' +``` + +Run tests + +``` +$ cargo test --features azure +``` + +### GCP + +We don't have a good story yet for testing the GCP integration locally. You will need to create a GCS bucket, a +service account that has access to it, and use this to run the tests. diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..613b6ab --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
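+
+# A downstream crate would typically enable only the backends it needs, for example
+# `object_store = { version = "0.3", features = ["aws"] }`; the cloud integrations are
+# all behind the optional features declared below.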
+ +[package] +name = "object_store" +version = "0.3.0" +edition = "2021" +license = "MIT/Apache-2.0" +readme = "README.md" +description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage and Azure Blob Storage" +keywords = [ + "object", + "storage", + "cloud", +] +repository = "https://github.com/apache/arrow-rs" + +[package.metadata.docs.rs] +all-features = true + +[dependencies] # In alphabetical order +async-trait = "0.1.53" +# Microsoft Azure Blob storage integration +azure_core = { version = "0.2", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { version = "0.2", optional = true, default-features = false, features = ["account"] } +azure_storage_blobs = { version = "0.2", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } +bytes = "1.0" +chrono = { version = "0.4", default-features = false, features = ["clock"] } +# Google Cloud Storage integration +futures = "0.3" +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +serde_json = { version = "1.0", default-features = false, optional = true } +rustls-pemfile = { version = "1.0", default-features = false, optional = true } +ring = { version = "0.16", default-features = false, features = ["std"] } +base64 = { version = "0.13", default-features = false, optional = true } +# for rusoto +hyper = { version = "0.14", optional = true, default-features = false } +# for rusoto +hyper-rustls = { version = "0.23.0", optional = true, default-features = false, features = ["webpki-tokio", "http1", "http2", "tls12"] } +itertools = "0.10.1" +percent-encoding = "2.1" +# rusoto crates are for Amazon S3 integration +rusoto_core = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } +rusoto_credential = { version = "0.48.0", optional = true, default-features = false } +rusoto_s3 = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } +rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } +snafu = "0.7" +tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time"] } +tracing = { version = "0.1" } +reqwest = { version = "0.11", optional = true, default-features = false, features = ["rustls-tls"] } +parking_lot = { version = "0.12" } +# Filesystem integration +url = "2.2" +walkdir = "2" + +[features] +azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"] +azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] +gcp = ["serde", "serde_json", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64"] +aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] + +[dev-dependencies] # In alphabetical order +dotenv = "0.15.0" +tempfile = "3.1.0" +futures-test = "0.3" diff --git a/README.md b/README.md new file mode 100644 index 0000000..313588b --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ + + +# Rust Object Store + +A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage. + +Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to Apache Arrow. 
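+
+A minimal usage sketch (not taken from the upstream documentation): it uses the crate's
+bundled in-memory `InMemory` store so the example is self-contained, and assumes the
+caller provides a `tokio` runtime. The same `ObjectStore` trait calls work against the
+S3, Azure and GCS backends.
+
+```rust
+use bytes::Bytes;
+use object_store::{memory::InMemory, path::Path, ObjectStore};
+
+#[tokio::main]
+async fn main() -> object_store::Result<()> {
+    // Every backend exposes the same `ObjectStore` trait; the in-memory store keeps
+    // this example free of external services.
+    let store = InMemory::new();
+    let location = Path::from_iter(["data", "example.txt"]);
+
+    // Write an object, then read a byte range back.
+    store.put(&location, Bytes::from("hello world")).await?;
+    let bytes = store.get_range(&location, 0..11).await?;
+    assert_eq!(bytes, Bytes::from("hello world"));
+
+    Ok(())
+}
+```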
+ +See [docs.rs](https://docs.rs/object_store) for usage instructions diff --git a/deny.toml b/deny.toml new file mode 100644 index 0000000..bfd060a --- /dev/null +++ b/deny.toml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Configuration documentation: +#  https://embarkstudios.github.io/cargo-deny/index.html + +[advisories] +vulnerability = "deny" +yanked = "deny" +unmaintained = "warn" +notice = "warn" +ignore = [ +] +git-fetch-with-cli = true + +[licenses] +default = "allow" +unlicensed = "allow" +copyleft = "allow" + +[bans] +multiple-versions = "warn" +deny = [ + # We are using rustls as the TLS implementation, so we shouldn't be linking + # in OpenSSL too. + # + # If you're hitting this, you might want to take a look at what new + # dependencies you have introduced and check if there's a way to depend on + # rustls instead of OpenSSL (tip: check the crate's feature flags). + { name = "openssl-sys" } +] diff --git a/src/aws.rs b/src/aws.rs new file mode 100644 index 0000000..7ebcc2a --- /dev/null +++ b/src/aws.rs @@ -0,0 +1,1041 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
An object store implementation for S3 +use crate::util::format_http_range; +use crate::{ + collect_bytes, + path::{Path, DELIMITER}, + util::format_prefix, + GetResult, ListResult, ObjectMeta, ObjectStore, Result, +}; +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use futures::{ + stream::{self, BoxStream}, + Future, Stream, StreamExt, TryStreamExt, +}; +use hyper::client::Builder as HyperBuilder; +use rusoto_core::ByteStream; +use rusoto_credential::{InstanceMetadataProvider, StaticProvider}; +use rusoto_s3::S3; +use rusoto_sts::WebIdentityProvider; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::ops::Range; +use std::{ + convert::TryFrom, fmt, num::NonZeroUsize, ops::Deref, sync::Arc, time::Duration, +}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; +use tracing::{debug, warn}; + +/// The maximum number of times a request will be retried in the case of an AWS server error +pub const MAX_NUM_RETRIES: u32 = 3; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display( + "Expected streamed data to have length {}, got {}", + expected, + actual + ))] + DataDoesNotMatchLength { expected: usize, actual: usize }, + + #[snafu(display( + "Did not receive any data. Bucket: {}, Location: {}", + bucket, + path + ))] + NoData { bucket: String, path: String }, + + #[snafu(display( + "Unable to DELETE data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToDeleteData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to GET data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToGetData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to HEAD data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToHeadData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to GET part of the data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToGetPieceOfData { + source: std::io::Error, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to PUT data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToPutData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to list data. Bucket: {}, Error: {} ({:?})", + bucket, + source, + source, + ))] + UnableToListData { + source: rusoto_core::RusotoError, + bucket: String, + }, + + #[snafu(display( + "Unable to copy object. Bucket: {}, From: {}, To: {}, Error: {}", + bucket, + from, + to, + source, + ))] + UnableToCopyObject { + source: rusoto_core::RusotoError, + bucket: String, + from: String, + to: String, + }, + + #[snafu(display( + "Unable to parse last modified date. Bucket: {}, Error: {} ({:?})", + bucket, + source, + source, + ))] + UnableToParseLastModified { + source: chrono::ParseError, + bucket: String, + }, + + #[snafu(display( + "Unable to buffer data into temporary file, Error: {} ({:?})", + source, + source, + ))] + UnableToBufferStream { source: std::io::Error }, + + #[snafu(display( + "Could not parse `{}` as an AWS region. Regions should look like `us-east-2`. 
{} ({:?})", + region, + source, + source, + ))] + InvalidRegion { + region: String, + source: rusoto_core::region::ParseRegionError, + }, + + #[snafu(display("Missing aws-access-key"))] + MissingAccessKey, + + #[snafu(display("Missing aws-secret-access-key"))] + MissingSecretAccessKey, + + NotFound { + path: String, + source: Box, + }, +} + +impl From for super::Error { + fn from(source: Error) -> Self { + match source { + Error::NotFound { path, source } => Self::NotFound { path, source }, + _ => Self::Generic { + store: "S3", + source: Box::new(source), + }, + } + } +} + +/// Configuration for connecting to [Amazon S3](https://aws.amazon.com/s3/). +pub struct AmazonS3 { + /// S3 client w/o any connection limit. + /// + /// You should normally use [`Self::client`] instead. + client_unrestricted: rusoto_s3::S3Client, + + /// Semaphore that limits the usage of [`client_unrestricted`](Self::client_unrestricted). + connection_semaphore: Arc, + + /// Bucket name used by this object store client. + bucket_name: String, +} + +impl fmt::Debug for AmazonS3 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AmazonS3") + .field("client", &"rusoto_s3::S3Client") + .field("bucket_name", &self.bucket_name) + .finish() + } +} + +impl fmt::Display for AmazonS3 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "AmazonS3({})", self.bucket_name) + } +} + +#[async_trait] +impl ObjectStore for AmazonS3 { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let bucket_name = self.bucket_name.clone(); + let request_factory = move || { + let bytes = bytes.clone(); + + let length = bytes.len(); + let stream_data = Ok(bytes); + let stream = futures::stream::once(async move { stream_data }); + let byte_stream = ByteStream::new_with_size(stream, length); + + rusoto_s3::PutObjectRequest { + bucket: bucket_name.clone(), + key: location.to_string(), + body: Some(byte_stream), + ..Default::default() + } + }; + + let s3 = self.client().await; + + s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + + async move { s3.put_object(request_factory()).await } + }) + .await + .context(UnableToPutDataSnafu { + bucket: &self.bucket_name, + path: location.as_ref(), + })?; + + Ok(()) + } + + async fn get(&self, location: &Path) -> Result { + Ok(GetResult::Stream( + self.get_object(location, None).await?.boxed(), + )) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let size_hint = range.end - range.start; + let stream = self.get_object(location, Some(range)).await?; + collect_bytes(stream, Some(size_hint)).await + } + + async fn head(&self, location: &Path) -> Result { + let key = location.to_string(); + let head_request = rusoto_s3::HeadObjectRequest { + bucket: self.bucket_name.clone(), + key: key.clone(), + ..Default::default() + }; + let s = self + .client() + .await + .head_object(head_request) + .await + .map_err(|e| match e { + rusoto_core::RusotoError::Service( + rusoto_s3::HeadObjectError::NoSuchKey(_), + ) => Error::NotFound { + path: key.clone(), + source: e.into(), + }, + rusoto_core::RusotoError::Unknown(h) if h.status.as_u16() == 404 => { + Error::NotFound { + path: key.clone(), + source: "resource not found".into(), + } + } + _ => Error::UnableToHeadData { + bucket: self.bucket_name.to_owned(), + path: key.clone(), + source: e, + }, + })?; + + // Note: GetObject and HeadObject return a different date format from ListObjects + // + // S3 List returns timestamps in the form + // 
2013-09-17T18:07:53.000Z + // S3 GetObject returns timestamps in the form + // Last-Modified: Sun, 1 Jan 2006 12:00:00 GMT + let last_modified = match s.last_modified { + Some(lm) => DateTime::parse_from_rfc2822(&lm) + .context(UnableToParseLastModifiedSnafu { + bucket: &self.bucket_name, + })? + .with_timezone(&Utc), + None => Utc::now(), + }; + + Ok(ObjectMeta { + last_modified, + location: location.clone(), + size: usize::try_from(s.content_length.unwrap_or(0)) + .expect("unsupported size on this platform"), + }) + } + + async fn delete(&self, location: &Path) -> Result<()> { + let bucket_name = self.bucket_name.clone(); + + let request_factory = move || rusoto_s3::DeleteObjectRequest { + bucket: bucket_name.clone(), + key: location.to_string(), + ..Default::default() + }; + + let s3 = self.client().await; + + s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + + async move { s3.delete_object(request_factory()).await } + }) + .await + .context(UnableToDeleteDataSnafu { + bucket: &self.bucket_name, + path: location.as_ref(), + })?; + + Ok(()) + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + Ok(self + .list_objects_v2(prefix, None) + .await? + .map_ok(move |list_objects_v2_result| { + let contents = list_objects_v2_result.contents.unwrap_or_default(); + let iter = contents + .into_iter() + .map(|object| convert_object_meta(object, &self.bucket_name)); + + futures::stream::iter(iter) + }) + .try_flatten() + .boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + Ok(self + .list_objects_v2(prefix, Some(DELIMITER.to_string())) + .await? + .try_fold( + ListResult { + common_prefixes: vec![], + objects: vec![], + }, + |acc, list_objects_v2_result| async move { + let mut res = acc; + let contents = list_objects_v2_result.contents.unwrap_or_default(); + let mut objects = contents + .into_iter() + .map(|object| convert_object_meta(object, &self.bucket_name)) + .collect::>>()?; + + res.objects.append(&mut objects); + + let prefixes = + list_objects_v2_result.common_prefixes.unwrap_or_default(); + res.common_prefixes.reserve(prefixes.len()); + + for p in prefixes { + let prefix = + p.prefix.expect("can't have a prefix without a value"); + res.common_prefixes.push(Path::parse(prefix)?); + } + + Ok(res) + }, + ) + .await?) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let from = from.as_ref(); + let to = to.as_ref(); + let bucket_name = self.bucket_name.clone(); + + let request_factory = move || rusoto_s3::CopyObjectRequest { + bucket: bucket_name.clone(), + copy_source: format!("{}/{}", &bucket_name, from), + key: to.to_string(), + ..Default::default() + }; + + let s3 = self.client().await; + + s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + + async move { s3.copy_object(request_factory()).await } + }) + .await + .context(UnableToCopyObjectSnafu { + bucket: &self.bucket_name, + from, + to, + })?; + + Ok(()) + } + + async fn copy_if_not_exists(&self, _source: &Path, _dest: &Path) -> Result<()> { + // Will need dynamodb_lock + Err(crate::Error::NotImplemented) + } +} + +fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result { + let key = object.key.expect("object doesn't exist without a key"); + let location = Path::parse(key)?; + let last_modified = match object.last_modified { + Some(lm) => DateTime::parse_from_rfc3339(&lm) + .context(UnableToParseLastModifiedSnafu { bucket })? 
+ .with_timezone(&Utc), + None => Utc::now(), + }; + let size = usize::try_from(object.size.unwrap_or(0)) + .expect("unsupported size on this platform"); + + Ok(ObjectMeta { + location, + last_modified, + size, + }) +} + +/// Configure a connection to Amazon S3 using the specified credentials in +/// the specified Amazon region and bucket. +#[allow(clippy::too_many_arguments)] +pub fn new_s3( + access_key_id: Option>, + secret_access_key: Option>, + region: impl Into, + bucket_name: impl Into, + endpoint: Option>, + session_token: Option>, + max_connections: NonZeroUsize, + allow_http: bool, +) -> Result { + let region = region.into(); + let region: rusoto_core::Region = match endpoint { + None => region.parse().context(InvalidRegionSnafu { region })?, + Some(endpoint) => rusoto_core::Region::Custom { + name: region, + endpoint: endpoint.into(), + }, + }; + + let mut builder = HyperBuilder::default(); + builder.pool_max_idle_per_host(max_connections.get()); + + let connector = if allow_http { + hyper_rustls::HttpsConnectorBuilder::new() + .with_webpki_roots() + .https_or_http() + .enable_http1() + .enable_http2() + .build() + } else { + hyper_rustls::HttpsConnectorBuilder::new() + .with_webpki_roots() + .https_only() + .enable_http1() + .enable_http2() + .build() + }; + + let http_client = rusoto_core::request::HttpClient::from_builder(builder, connector); + + let client = match (access_key_id, secret_access_key, session_token) { + (Some(access_key_id), Some(secret_access_key), Some(session_token)) => { + let credentials_provider = StaticProvider::new( + access_key_id.into(), + secret_access_key.into(), + Some(session_token.into()), + None, + ); + rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) + } + (Some(access_key_id), Some(secret_access_key), None) => { + let credentials_provider = StaticProvider::new_minimal( + access_key_id.into(), + secret_access_key.into(), + ); + rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) + } + (None, Some(_), _) => return Err(Error::MissingAccessKey.into()), + (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), + _ if std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE").is_some() => { + rusoto_s3::S3Client::new_with( + http_client, + WebIdentityProvider::from_k8s_env(), + region, + ) + } + _ => rusoto_s3::S3Client::new_with( + http_client, + InstanceMetadataProvider::new(), + region, + ), + }; + + Ok(AmazonS3 { + client_unrestricted: client, + connection_semaphore: Arc::new(Semaphore::new(max_connections.get())), + bucket_name: bucket_name.into(), + }) +} + +/// Create a new [`AmazonS3`] that always errors +pub fn new_failing_s3() -> Result { + new_s3( + Some("foo"), + Some("bar"), + "us-east-1", + "bucket", + None as Option<&str>, + None as Option<&str>, + NonZeroUsize::new(16).unwrap(), + true, + ) +} + +/// S3 client bundled w/ a semaphore permit. +#[derive(Clone)] +struct SemaphoreClient { + /// Permit for this specific use of the client. + /// + /// Note that this field is never read and therefore considered "dead code" by rustc. + #[allow(dead_code)] + permit: Arc, + + inner: rusoto_s3::S3Client, +} + +impl Deref for SemaphoreClient { + type Target = rusoto_s3::S3Client; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl AmazonS3 { + /// Get a client according to the current connection limit. 
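+    ///
+    /// This acquires a permit from `connection_semaphore` and pairs it with a clone of the
+    /// underlying `rusoto_s3::S3Client`; the permit is released once the returned
+    /// `SemaphoreClient` (and any clones of it) is dropped, bounding the number of
+    /// in-flight S3 requests to the `max_connections` passed to `new_s3`.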
+ async fn client(&self) -> SemaphoreClient { + let permit = Arc::clone(&self.connection_semaphore) + .acquire_owned() + .await + .expect("semaphore shouldn't be closed yet"); + SemaphoreClient { + permit: Arc::new(permit), + inner: self.client_unrestricted.clone(), + } + } + + async fn get_object( + &self, + location: &Path, + range: Option>, + ) -> Result>> { + let key = location.to_string(); + let get_request = rusoto_s3::GetObjectRequest { + bucket: self.bucket_name.clone(), + key: key.clone(), + range: range.map(format_http_range), + ..Default::default() + }; + let bucket_name = self.bucket_name.clone(); + let stream = self + .client() + .await + .get_object(get_request) + .await + .map_err(|e| match e { + rusoto_core::RusotoError::Service( + rusoto_s3::GetObjectError::NoSuchKey(_), + ) => Error::NotFound { + path: key.clone(), + source: e.into(), + }, + _ => Error::UnableToGetData { + bucket: self.bucket_name.to_owned(), + path: key.clone(), + source: e, + }, + })? + .body + .context(NoDataSnafu { + bucket: self.bucket_name.to_owned(), + path: key.clone(), + })? + .map_err(move |source| Error::UnableToGetPieceOfData { + source, + bucket: bucket_name.clone(), + path: key.clone(), + }) + .err_into(); + + Ok(stream) + } + + async fn list_objects_v2( + &self, + prefix: Option<&Path>, + delimiter: Option, + ) -> Result>> { + enum ListState { + Start, + HasMore(String), + Done, + } + + let prefix = format_prefix(prefix); + let bucket = self.bucket_name.clone(); + + let request_factory = move || rusoto_s3::ListObjectsV2Request { + bucket, + prefix, + delimiter, + ..Default::default() + }; + let s3 = self.client().await; + + Ok(stream::unfold(ListState::Start, move |state| { + let request_factory = request_factory.clone(); + let s3 = s3.clone(); + + async move { + let continuation_token = match &state { + ListState::HasMore(continuation_token) => Some(continuation_token), + ListState::Done => { + return None; + } + // If this is the first request we've made, we don't need to make any + // modifications to the request + ListState::Start => None, + }; + + let resp = s3_request(move || { + let (s3, request_factory, continuation_token) = ( + s3.clone(), + request_factory.clone(), + continuation_token.cloned(), + ); + + async move { + s3.list_objects_v2(rusoto_s3::ListObjectsV2Request { + continuation_token, + ..request_factory() + }) + .await + } + }) + .await; + + let resp = match resp { + Ok(resp) => resp, + Err(e) => return Some((Err(e), state)), + }; + + // The AWS response contains a field named `is_truncated` as well as + // `next_continuation_token`, and we're assuming that `next_continuation_token` + // is only set when `is_truncated` is true (and therefore not + // checking `is_truncated`). + let next_state = if let Some(next_continuation_token) = + &resp.next_continuation_token + { + ListState::HasMore(next_continuation_token.to_string()) + } else { + ListState::Done + }; + + Some((Ok(resp), next_state)) + } + }) + .map_err(move |e| { + Error::UnableToListData { + source: e, + bucket: self.bucket_name.clone(), + } + .into() + }) + .boxed()) + } +} + +/// Handles retrying a request to S3 up to `MAX_NUM_RETRIES` times if S3 returns 5xx server errors. +/// +/// The `future_factory` argument is a function `F` that takes no arguments and, when called, will +/// return a `Future` (type `G`) that, when `await`ed, will perform a request to S3 through +/// `rusoto` and return a `Result` that returns some type `R` on success and some +/// `rusoto_core::RusotoError` on error. 
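+///
+/// (As a concrete reading of the backoff described below: the wait between attempts is
+/// `2^attempts * 50` milliseconds, i.e. roughly 100 ms, 200 ms and then 400 ms across the
+/// `MAX_NUM_RETRIES` retries.)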
+/// +/// If the executed `Future` returns success, this function will return that success. +/// If the executed `Future` returns a 5xx server error, this function will wait an amount of +/// time that increases exponentially with the number of times it has retried, get a new `Future` by +/// calling `future_factory` again, and retry the request by `await`ing the `Future` again. +/// The retries will continue until the maximum number of retries has been attempted. In that case, +/// this function will return the last encountered error. +/// +/// Client errors (4xx) will never be retried by this function. +async fn s3_request( + future_factory: F, +) -> Result> +where + E: std::error::Error + Send, + F: Fn() -> G + Send, + G: Future>> + Send, + R: Send, +{ + let mut attempts = 0; + + loop { + let request = future_factory(); + + let result = request.await; + + match result { + Ok(r) => return Ok(r), + Err(error) => { + attempts += 1; + + let should_retry = matches!( + error, + rusoto_core::RusotoError::Unknown(ref response) + if response.status.is_server_error() + ); + + if attempts > MAX_NUM_RETRIES { + warn!( + ?error, + attempts, "maximum number of retries exceeded for AWS S3 request" + ); + return Err(error); + } else if !should_retry { + return Err(error); + } else { + debug!(?error, attempts, "retrying AWS S3 request"); + let wait_time = Duration::from_millis(2u64.pow(attempts) * 50); + tokio::time::sleep(wait_time).await; + } + } + } + } +} + +impl Error { + #[cfg(test)] + fn s3_error_due_to_credentials(&self) -> bool { + use rusoto_core::RusotoError; + use Error::*; + + matches!( + self, + UnableToPutData { + source: RusotoError::Credentials(_), + bucket: _, + path: _, + } | UnableToGetData { + source: RusotoError::Credentials(_), + bucket: _, + path: _, + } | UnableToDeleteData { + source: RusotoError::Credentials(_), + bucket: _, + path: _, + } | UnableToListData { + source: RusotoError::Credentials(_), + bucket: _, + } + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + tests::{ + get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, + }, + Error as ObjectStoreError, ObjectStore, + }; + use bytes::Bytes; + use std::env; + + type TestError = Box; + type Result = std::result::Result; + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + #[derive(Debug)] + struct AwsConfig { + access_key_id: String, + secret_access_key: String, + region: String, + bucket: String, + endpoint: Option, + token: Option, + } + + // Helper macro to skip tests if TEST_INTEGRATION and the AWS environment variables are not set. + macro_rules! 
maybe_skip_integration { + () => {{ + dotenv::dotenv().ok(); + + let required_vars = [ + "AWS_DEFAULT_REGION", + "OBJECT_STORE_BUCKET", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + ]; + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ); + } else if force.is_err() { + eprintln!( + "skipping AWS integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + return; + } else { + AwsConfig { + access_key_id: env::var("AWS_ACCESS_KEY_ID") + .expect("already checked AWS_ACCESS_KEY_ID"), + secret_access_key: env::var("AWS_SECRET_ACCESS_KEY") + .expect("already checked AWS_SECRET_ACCESS_KEY"), + region: env::var("AWS_DEFAULT_REGION") + .expect("already checked AWS_DEFAULT_REGION"), + bucket: env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + endpoint: env::var("AWS_ENDPOINT").ok(), + token: env::var("AWS_SESSION_TOKEN").ok(), + } + } + }}; + } + + fn check_credentials(r: Result) -> Result { + if let Err(e) = &r { + let e = &**e; + if let Some(e) = e.downcast_ref::() { + if e.s3_error_due_to_credentials() { + eprintln!( + "Try setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY \ + environment variables" + ); + } + } + } + + r + } + + fn make_integration(config: AwsConfig) -> AmazonS3 { + new_s3( + Some(config.access_key_id), + Some(config.secret_access_key), + config.region, + config.bucket, + config.endpoint, + config.token, + NonZeroUsize::new(16).unwrap(), + true, + ) + .expect("Valid S3 config") + } + + #[tokio::test] + async fn s3_test() { + let config = maybe_skip_integration!(); + let integration = make_integration(config); + + check_credentials(put_get_delete_list(&integration).await).unwrap(); + check_credentials(list_uses_directories_correctly(&integration).await).unwrap(); + check_credentials(list_with_delimiter(&integration).await).unwrap(); + check_credentials(rename_and_copy(&integration).await).unwrap(); + } + + #[tokio::test] + async fn s3_test_get_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + if let ObjectStoreError::NotFound { path, source } = err { + let source_variant = source.downcast_ref::>(); + assert!( + matches!( + source_variant, + Some(rusoto_core::RusotoError::Service( + rusoto_s3::GetObjectError::NoSuchKey(_) + )), + ), + "got: {:?}", + source_variant + ); + assert_eq!(path, NON_EXISTENT_NAME); + } else { + panic!("unexpected error type: {:?}", err); + } + } + + #[tokio::test] + async fn s3_test_get_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.get(&location).await.unwrap_err().to_string(); + assert!( + err.contains("The specified bucket does not exist"), + "{}", + err + ) + } + + #[tokio::test] + async fn s3_test_put_nonexistent_bucket() { + let mut config = 
maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + let data = Bytes::from("arbitrary data"); + + let err = integration + .put(&location, data) + .await + .unwrap_err() + .to_string(); + + assert!( + err.contains("The specified bucket does not exist") + && err.contains("Unable to PUT data"), + "{}", + err + ) + } + + #[tokio::test] + async fn s3_test_delete_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + integration.delete(&location).await.unwrap(); + } + + #[tokio::test] + async fn s3_test_delete_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = make_integration(config); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.delete(&location).await.unwrap_err().to_string(); + assert!( + err.contains("The specified bucket does not exist") + && err.contains("Unable to DELETE data"), + "{}", + err + ) + } +} diff --git a/src/azure.rs b/src/azure.rs new file mode 100644 index 0000000..5f43279 --- /dev/null +++ b/src/azure.rs @@ -0,0 +1,646 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store implementation for Azure blob storage +use crate::{ + path::{Path, DELIMITER}, + util::format_prefix, + GetResult, ListResult, ObjectMeta, ObjectStore, Result, +}; +use async_trait::async_trait; +use azure_core::{prelude::*, HttpClient}; +use azure_storage::core::prelude::{AsStorageClient, StorageAccountClient}; +use azure_storage_blobs::blob::responses::ListBlobsResponse; +use azure_storage_blobs::blob::Blob; +use azure_storage_blobs::{ + prelude::{AsBlobClient, AsContainerClient, ContainerClient}, + DeleteSnapshotsMethod, +}; +use bytes::Bytes; +use futures::{ + stream::{self, BoxStream}, + StreamExt, TryStreamExt, +}; +use snafu::{ResultExt, Snafu}; +use std::collections::BTreeSet; +use std::{convert::TryInto, sync::Arc}; + +/// A specialized `Error` for Azure object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display( + "Unable to DELETE data. Container: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToDeleteData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to GET data. Container: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToGetData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to HEAD data. 
Container: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToHeadData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to GET part of the data. Container: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToGetPieceOfData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to PUT data. Bucket: {}, Location: {}, Error: {} ({:?})", + container, + path, + source, + source, + ))] + UnableToPutData { + source: Box, + container: String, + path: String, + }, + + #[snafu(display( + "Unable to list data. Bucket: {}, Error: {} ({:?})", + container, + source, + source, + ))] + UnableToListData { + source: Box, + container: String, + }, + + #[snafu(display( + "Unable to copy object. Container: {}, From: {}, To: {}, Error: {}", + container, + from, + to, + source + ))] + UnableToCopyFile { + source: Box, + container: String, + from: String, + to: String, + }, + + #[snafu(display( + "Unable parse source url. Container: {}, Error: {}", + container, + source + ))] + UnableToParseUrl { + source: url::ParseError, + container: String, + }, + + NotFound { + path: String, + source: Box, + }, + + AlreadyExists { + path: String, + source: Box, + }, + + #[cfg(not(feature = "azure_test"))] + #[snafu(display( + "Azurite (azure emulator) support not compiled in, please add `azure_test` feature" + ))] + NoEmulatorFeature, +} + +impl From for super::Error { + fn from(source: Error) -> Self { + match source { + Error::NotFound { path, source } => Self::NotFound { path, source }, + Error::AlreadyExists { path, source } => Self::AlreadyExists { path, source }, + _ => Self::Generic { + store: "Azure Blob Storage", + source: Box::new(source), + }, + } + } +} + +/// Configuration for connecting to [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). +#[derive(Debug)] +pub struct MicrosoftAzure { + container_client: Arc, + container_name: String, + blob_base_url: String, + is_emulator: bool, +} + +impl std::fmt::Display for MicrosoftAzure { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.is_emulator { + true => write!(f, "MicrosoftAzureEmulator({})", self.container_name), + false => write!(f, "MicrosoftAzure({})", self.container_name), + } + } +} + +#[allow(clippy::borrowed_box)] +fn check_err_not_found(err: &Box) -> bool { + if let Some(azure_core::HttpError::StatusCode { status, .. 
}) = + err.downcast_ref::() + { + return status.as_u16() == 404; + }; + false +} + +#[async_trait] +impl ObjectStore for MicrosoftAzure { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let bytes = bytes::BytesMut::from(&*bytes); + + self.container_client + .as_blob_client(location.as_ref()) + .put_block_blob(bytes) + .execute() + .await + .context(UnableToPutDataSnafu { + container: &self.container_name, + path: location.to_owned(), + })?; + + Ok(()) + } + + async fn get(&self, location: &Path) -> Result { + let blob = self + .container_client + .as_blob_client(location.as_ref()) + .get() + .execute() + .await + .map_err(|err| { + if check_err_not_found(&err) { + return Error::NotFound { + source: err, + path: location.to_string(), + }; + }; + Error::UnableToGetData { + source: err, + container: self.container_name.clone(), + path: location.to_string(), + } + })?; + + Ok(GetResult::Stream( + futures::stream::once(async move { Ok(blob.data) }).boxed(), + )) + } + + async fn get_range( + &self, + location: &Path, + range: std::ops::Range, + ) -> Result { + let blob = self + .container_client + .as_blob_client(location.as_ref()) + .get() + .range(range) + .execute() + .await + .map_err(|err| { + if check_err_not_found(&err) { + return Error::NotFound { + source: err, + path: location.to_string(), + }; + }; + Error::UnableToGetPieceOfData { + source: err, + container: self.container_name.clone(), + path: location.to_string(), + } + })?; + + Ok(blob.data) + } + + async fn head(&self, location: &Path) -> Result { + let res = self + .container_client + .as_blob_client(location.as_ref()) + .get_properties() + .execute() + .await + .map_err(|err| { + if check_err_not_found(&err) { + return Error::NotFound { + source: err, + path: location.to_string(), + }; + }; + Error::UnableToHeadData { + source: err, + container: self.container_name.clone(), + path: location.to_string(), + } + })?; + + convert_object_meta(res.blob)?.ok_or_else(|| super::Error::NotFound { + path: location.to_string(), + source: "is directory".to_string().into(), + }) + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.container_client + .as_blob_client(location.as_ref()) + .delete() + .delete_snapshots_method(DeleteSnapshotsMethod::Include) + .execute() + .await + .context(UnableToDeleteDataSnafu { + container: &self.container_name, + path: location.to_string(), + })?; + + Ok(()) + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let stream = self + .list_impl(prefix, false) + .await? + .map_ok(|resp| { + let names = resp + .blobs + .blobs + .into_iter() + .filter_map(|blob| convert_object_meta(blob).transpose()); + futures::stream::iter(names) + }) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let mut stream = self.list_impl(prefix, true).await?; + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + while let Some(res) = stream.next().await { + let response = res?; + + let prefixes = response.blobs.blob_prefix.unwrap_or_default(); + for p in prefixes { + common_prefixes.insert(Path::parse(&p.name)?); + } + + let blobs = response.blobs.blobs; + objects.reserve(blobs.len()); + for blob in blobs { + if let Some(meta) = convert_object_meta(blob)? 
{ + objects.push(meta); + } + } + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let from_url = self.get_copy_from_url(from)?; + self.container_client + .as_blob_client(to.as_ref()) + .copy(&from_url) + .execute() + .await + .context(UnableToCopyFileSnafu { + container: &self.container_name, + from: from.as_ref(), + to: to.as_ref(), + })?; + Ok(()) + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let from_url = self.get_copy_from_url(from)?; + self.container_client + .as_blob_client(to.as_ref()) + .copy(&from_url) + .if_match_condition(IfMatchCondition::NotMatch("*".to_string())) + .execute() + .await + .map_err(|err| { + if let Some(azure_core::HttpError::StatusCode { status, .. }) = + err.downcast_ref::() + { + if status.as_u16() == 409 { + return Error::AlreadyExists { + source: err, + path: to.to_string(), + }; + }; + }; + Error::UnableToCopyFile { + source: err, + container: self.container_name.clone(), + from: from.to_string(), + to: to.to_string(), + } + })?; + Ok(()) + } +} + +impl MicrosoftAzure { + /// helper function to create a source url for copy function + fn get_copy_from_url(&self, from: &Path) -> Result { + Ok(reqwest::Url::parse(&format!( + "{}/{}/{}", + &self.blob_base_url, self.container_name, from + )) + .context(UnableToParseUrlSnafu { + container: &self.container_name, + })?) + } + + async fn list_impl( + &self, + prefix: Option<&Path>, + delimiter: bool, + ) -> Result>> { + enum ListState { + Start, + HasMore(String), + Done, + } + + let prefix_raw = format_prefix(prefix); + + Ok(stream::unfold(ListState::Start, move |state| { + let mut request = self.container_client.list_blobs(); + + if let Some(p) = prefix_raw.as_deref() { + request = request.prefix(p); + } + + if delimiter { + request = request.delimiter(Delimiter::new(DELIMITER)); + } + + async move { + match state { + ListState::HasMore(ref marker) => { + request = request.next_marker(marker as &str); + } + ListState::Done => { + return None; + } + ListState::Start => {} + } + + let resp = match request.execute().await.context(UnableToListDataSnafu { + container: &self.container_name, + }) { + Ok(resp) => resp, + Err(err) => return Some((Err(crate::Error::from(err)), state)), + }; + + let next_state = if let Some(marker) = &resp.next_marker { + ListState::HasMore(marker.as_str().to_string()) + } else { + ListState::Done + }; + + Some((Ok(resp), next_state)) + } + }) + .boxed()) + } +} + +/// Returns `None` if is a directory +fn convert_object_meta(blob: Blob) -> Result> { + let location = Path::parse(blob.name)?; + let last_modified = blob.properties.last_modified; + let size = blob + .properties + .content_length + .try_into() + .expect("unsupported size on this platform"); + + // This is needed to filter out gen2 directories + // https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues#blob-storage-apis + Ok((size > 0).then(|| ObjectMeta { + location, + last_modified, + size, + })) +} + +#[cfg(feature = "azure_test")] +fn check_if_emulator_works() -> Result<()> { + Ok(()) +} + +#[cfg(not(feature = "azure_test"))] +fn check_if_emulator_works() -> Result<()> { + Err(Error::NoEmulatorFeature.into()) +} + +/// Configure a connection to container with given name on Microsoft Azure +/// Blob store. +/// +/// The credentials `account` and `access_key` must provide access to the +/// store. 
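+///
+/// For illustration only (this example is not from the upstream documentation), a call
+/// against the Azurite emulator could look like the following; with `use_emulator = true`
+/// the well-known development storage account is used, so the credential arguments are
+/// effectively placeholders:
+///
+/// ```ignore
+/// let azure = new_azure("devstoreaccount1", "<access key>", "test-bucket", true)?;
+/// ```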
+pub fn new_azure( + account: impl Into, + access_key: impl Into, + container_name: impl Into, + use_emulator: bool, +) -> Result { + let account = account.into(); + let access_key = access_key.into(); + let http_client: Arc = Arc::new(reqwest::Client::new()); + + let (is_emulator, storage_account_client) = if use_emulator { + check_if_emulator_works()?; + (true, StorageAccountClient::new_emulator_default()) + } else { + ( + false, + StorageAccountClient::new_access_key( + Arc::clone(&http_client), + &account, + &access_key, + ), + ) + }; + + let storage_client = storage_account_client.as_storage_client(); + let blob_base_url = storage_account_client + .blob_storage_url() + .as_ref() + // make url ending consistent between the emulator and remote storage account + .trim_end_matches('/') + .to_string(); + + let container_name = container_name.into(); + + let container_client = storage_client.as_container_client(&container_name); + + Ok(MicrosoftAzure { + container_client, + container_name, + blob_base_url, + is_emulator, + }) +} + +#[cfg(test)] +mod tests { + use crate::azure::new_azure; + use crate::tests::{ + copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, + }; + use std::env; + + #[derive(Debug)] + struct AzureConfig { + storage_account: String, + access_key: String, + bucket: String, + use_emulator: bool, + } + + // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment + // variables are not set. + macro_rules! maybe_skip_integration { + () => {{ + dotenv::dotenv().ok(); + + let use_emulator = std::env::var("AZURE_USE_EMULATOR").is_ok(); + + let mut required_vars = vec!["OBJECT_STORE_BUCKET"]; + if !use_emulator { + required_vars.push("AZURE_STORAGE_ACCOUNT"); + required_vars.push("AZURE_STORAGE_ACCESS_KEY"); + } + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = std::env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ) + } else if force.is_err() { + eprintln!( + "skipping Azure integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + return; + } else { + AzureConfig { + storage_account: env::var("AZURE_STORAGE_ACCOUNT") + .unwrap_or_default(), + access_key: env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), + bucket: env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + use_emulator, + } + } + }}; + } + + #[tokio::test] + async fn azure_blob_test() { + let config = maybe_skip_integration!(); + let integration = new_azure( + config.storage_account, + config.access_key, + config.bucket, + config.use_emulator, + ) + .unwrap(); + + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + copy_if_not_exists(&integration).await.unwrap(); + } +} diff --git a/src/gcp.rs b/src/gcp.rs new file mode 100644 index 0000000..84fb572 --- /dev/null +++ b/src/gcp.rs @@ -0,0 +1,721 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store implementation for Google Cloud Storage +use std::collections::BTreeSet; +use std::fs::File; +use std::io::BufReader; +use std::ops::Range; + +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; +use reqwest::header::RANGE; +use reqwest::{header, Client, Method, Response, StatusCode}; +use snafu::{ResultExt, Snafu}; + +use crate::util::format_http_range; +use crate::{ + oauth::OAuthProvider, + path::{Path, DELIMITER}, + token::TokenCache, + util::format_prefix, + GetResult, ListResult, ObjectMeta, ObjectStore, Result, +}; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Unable to open service account file: {}", source))] + OpenCredentials { source: std::io::Error }, + + #[snafu(display("Unable to decode service account file: {}", source))] + DecodeCredentials { source: serde_json::Error }, + + #[snafu(display("Error performing list request: {}", source))] + ListRequest { source: reqwest::Error }, + + #[snafu(display("Error performing get request {}: {}", path, source))] + GetRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing delete request {}: {}", path, source))] + DeleteRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing copy request {}: {}", path, source))] + CopyRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing put request: {}", source))] + PutRequest { source: reqwest::Error }, + + #[snafu(display("Error decoding object size: {}", source))] + InvalidSize { source: std::num::ParseIntError }, +} + +impl From for super::Error { + fn from(err: Error) -> Self { + match err { + Error::GetRequest { source, path } + | Error::DeleteRequest { source, path } + | Error::CopyRequest { source, path } + if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => + { + Self::NotFound { + path, + source: Box::new(source), + } + } + _ => Self::Generic { + store: "GCS", + source: Box::new(err), + }, + } + } +} + +/// A deserialized `service-account-********.json`-file. +#[derive(serde::Deserialize, Debug)] +struct ServiceAccountCredentials { + /// The private key in RSA format. + pub private_key: String, + + /// The email address associated with the service account. + pub client_email: String, + + /// Base URL for GCS + #[serde(default = "default_gcs_base_url")] + pub gcs_base_url: String, + + /// Disable oauth and use empty tokens. 
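+    ///
+    /// (Like `gcs_base_url`, this field is not part of Google's standard
+    /// service-account JSON; both have serde defaults below, presumably so
+    /// tests can point at a storage emulator without real OAuth credentials.)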
+ #[serde(default = "default_disable_oauth")] + pub disable_oauth: bool, +} + +fn default_gcs_base_url() -> String { + "https://storage.googleapis.com".to_owned() +} + +fn default_disable_oauth() -> bool { + false +} + +#[derive(serde::Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +struct ListResponse { + next_page_token: Option, + #[serde(default)] + prefixes: Vec, + #[serde(default)] + items: Vec, +} + +#[derive(serde::Deserialize, Debug)] +struct Object { + name: String, + size: String, + updated: DateTime, +} + +/// Configuration for connecting to [Google Cloud Storage](https://cloud.google.com/storage/). +#[derive(Debug)] +pub struct GoogleCloudStorage { + client: Client, + base_url: String, + + oauth_provider: Option, + token_cache: TokenCache, + + bucket_name: String, + bucket_name_encoded: String, + + // TODO: Hook this up in tests + max_list_results: Option, +} + +impl std::fmt::Display for GoogleCloudStorage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "GoogleCloudStorage({})", self.bucket_name) + } +} + +impl GoogleCloudStorage { + async fn get_token(&self) -> Result { + if let Some(oauth_provider) = &self.oauth_provider { + Ok(self + .token_cache + .get_or_insert_with(|| oauth_provider.fetch_token(&self.client)) + .await?) + } else { + Ok("".to_owned()) + } + } + + fn object_url(&self, path: &Path) -> String { + let encoded = + percent_encoding::utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); + format!( + "{}/storage/v1/b/{}/o/{}", + self.base_url, self.bucket_name_encoded, encoded + ) + } + + /// Perform a get request + async fn get_request( + &self, + path: &Path, + range: Option>, + head: bool, + ) -> Result { + let token = self.get_token().await?; + let url = self.object_url(path); + + let mut builder = self.client.request(Method::GET, url); + + if let Some(range) = range { + builder = builder.header(RANGE, format_http_range(range)); + } + + let alt = match head { + true => "json", + false => "media", + }; + + let response = builder + .bearer_auth(token) + .query(&[("alt", alt)]) + .send() + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })? + .error_for_status() + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } + + /// Perform a put request + async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { + let token = self.get_token().await?; + let url = format!( + "{}/upload/storage/v1/b/{}/o", + self.base_url, self.bucket_name_encoded + ); + + self.client + .request(Method::POST, url) + .bearer_auth(token) + .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_LENGTH, payload.len()) + .query(&[("uploadType", "media"), ("name", path.as_ref())]) + .body(payload) + .send() + .await + .context(PutRequestSnafu)? + .error_for_status() + .context(PutRequestSnafu)?; + + Ok(()) + } + + /// Perform a delete request + async fn delete_request(&self, path: &Path) -> Result<()> { + let token = self.get_token().await?; + let url = self.object_url(path); + + let builder = self.client.request(Method::DELETE, url); + builder + .bearer_auth(token) + .send() + .await + .context(DeleteRequestSnafu { + path: path.as_ref(), + })? 
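+            // `error_for_status` turns non-2xx responses (e.g. 404 for a
+            // missing object) into an error; the `From<Error>` conversion
+            // above then maps a NOT_FOUND status to `crate::Error::NotFound`.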
+ .error_for_status() + .context(DeleteRequestSnafu { + path: path.as_ref(), + })?; + + Ok(()) + } + + /// Perform a copy request + async fn copy_request( + &self, + from: &Path, + to: &Path, + if_not_exists: bool, + ) -> Result<()> { + let token = self.get_token().await?; + + let source = + percent_encoding::utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); + let destination = + percent_encoding::utf8_percent_encode(to.as_ref(), NON_ALPHANUMERIC); + let url = format!( + "{}/storage/v1/b/{}/o/{}/copyTo/b/{}/o/{}", + self.base_url, + self.bucket_name_encoded, + source, + self.bucket_name_encoded, + destination + ); + + let mut builder = self.client.request(Method::POST, url); + + if if_not_exists { + builder = builder.query(&[("ifGenerationMatch", "0")]); + } + + builder + .bearer_auth(token) + .send() + .await + .context(CopyRequestSnafu { + path: from.as_ref(), + })? + .error_for_status() + .context(CopyRequestSnafu { + path: from.as_ref(), + })?; + + Ok(()) + } + + /// Perform a list request + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + page_token: Option<&str>, + ) -> Result { + let token = self.get_token().await?; + + let url = format!( + "{}/storage/v1/b/{}/o", + self.base_url, self.bucket_name_encoded + ); + + let mut query = Vec::with_capacity(4); + if delimiter { + query.push(("delimiter", DELIMITER)) + } + + if let Some(prefix) = &prefix { + query.push(("prefix", prefix)) + } + + if let Some(page_token) = page_token { + query.push(("pageToken", page_token)) + } + + if let Some(max_results) = &self.max_list_results { + query.push(("maxResults", max_results)) + } + + let response: ListResponse = self + .client + .request(Method::GET, url) + .query(&query) + .bearer_auth(token) + .send() + .await + .context(ListRequestSnafu)? + .error_for_status() + .context(ListRequestSnafu)? + .json() + .await + .context(ListRequestSnafu)?; + + Ok(response) + } + + /// Perform a list operation automatically handling pagination + fn list_paginated( + &self, + prefix: Option<&Path>, + delimiter: bool, + ) -> Result>> { + let prefix = format_prefix(prefix); + + enum ListState { + Start, + HasMore(String), + Done, + } + + Ok(futures::stream::unfold(ListState::Start, move |state| { + let prefix = prefix.clone(); + + async move { + let page_token = match &state { + ListState::Start => None, + ListState::HasMore(page_token) => Some(page_token.as_str()), + ListState::Done => { + return None; + } + }; + + let resp = match self + .list_request(prefix.as_deref(), delimiter, page_token) + .await + { + Ok(resp) => resp, + Err(e) => return Some((Err(e), state)), + }; + + let next_state = match &resp.next_page_token { + Some(token) => ListState::HasMore(token.clone()), + None => ListState::Done, + }; + + Some((Ok(resp), next_state)) + } + }) + .boxed()) + } +} + +#[async_trait] +impl ObjectStore for GoogleCloudStorage { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.put_request(location, bytes).await + } + + async fn get(&self, location: &Path) -> Result { + let response = self.get_request(location, None, false).await?; + let stream = response + .bytes_stream() + .map_err(|source| crate::Error::Generic { + store: "GCS", + source: Box::new(source), + }) + .boxed(); + + Ok(GetResult::Stream(stream)) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let response = self.get_request(location, Some(range), false).await?; + Ok(response.bytes().await.context(GetRequestSnafu { + path: location.as_ref(), + })?) 
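+        // The RANGE header set inside `get_request` means the body returned
+        // here is already limited to the requested byte range.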
+ } + + async fn head(&self, location: &Path) -> Result { + let response = self.get_request(location, None, true).await?; + let object = response.json().await.context(GetRequestSnafu { + path: location.as_ref(), + })?; + convert_object_meta(&object) + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.delete_request(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let stream = self + .list_paginated(prefix, false)? + .map_ok(|r| { + futures::stream::iter( + r.items.into_iter().map(|x| convert_object_meta(&x)), + ) + }) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let mut stream = self.list_paginated(prefix, true)?; + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + while let Some(result) = stream.next().await { + let response = result?; + + for p in response.prefixes { + common_prefixes.insert(Path::parse(p)?); + } + + objects.reserve(response.items.len()); + for object in &response.items { + objects.push(convert_object_meta(object)?); + } + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.copy_request(from, to, false).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.copy_request(from, to, true).await + } +} + +fn reader_credentials_file( + service_account_path: impl AsRef, +) -> Result { + let file = File::open(service_account_path).context(OpenCredentialsSnafu)?; + let reader = BufReader::new(file); + Ok(serde_json::from_reader(reader).context(DecodeCredentialsSnafu)?) +} + +/// Configure a connection to Google Cloud Storage. +pub fn new_gcs( + service_account_path: impl AsRef, + bucket_name: impl Into, +) -> Result { + let credentials = reader_credentials_file(service_account_path)?; + let client = Client::new(); + + // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes + let scope = "https://www.googleapis.com/auth/devstorage.full_control"; + let audience = "https://www.googleapis.com/oauth2/v4/token".to_string(); + + let oauth_provider = (!credentials.disable_oauth) + .then(|| { + OAuthProvider::new( + credentials.client_email, + credentials.private_key, + scope.to_string(), + audience, + ) + }) + .transpose()?; + + let bucket_name = bucket_name.into(); + let encoded_bucket_name = + percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); + + // The cloud storage crate currently only supports authentication via + // environment variables. Set the environment variable explicitly so + // that we can optionally accept command line arguments instead. 
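+    // `max_list_results` is left as `None`, so list requests fall back to the
+    // service's default page size (see the TODO on the struct field).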
+ Ok(GoogleCloudStorage { + client, + base_url: credentials.gcs_base_url, + oauth_provider, + token_cache: Default::default(), + bucket_name, + bucket_name_encoded: encoded_bucket_name, + max_list_results: None, + }) +} + +fn convert_object_meta(object: &Object) -> Result { + let location = Path::parse(&object.name)?; + let last_modified = object.updated; + let size = object.size.parse().context(InvalidSizeSnafu)?; + + Ok(ObjectMeta { + location, + last_modified, + size, + }) +} + +#[cfg(test)] +mod test { + use std::env; + + use bytes::Bytes; + + use crate::{ + tests::{ + get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, + }, + Error as ObjectStoreError, ObjectStore, + }; + + use super::*; + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + #[derive(Debug)] + struct GoogleCloudConfig { + bucket: String, + service_account: String, + } + + // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set. + macro_rules! maybe_skip_integration { + () => {{ + dotenv::dotenv().ok(); + + let required_vars = ["OBJECT_STORE_BUCKET", "GOOGLE_SERVICE_ACCOUNT"]; + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = std::env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ) + } else if force.is_err() { + eprintln!( + "skipping Google Cloud integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + return; + } else { + GoogleCloudConfig { + bucket: env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + service_account: env::var("GOOGLE_SERVICE_ACCOUNT") + .expect("already checked GOOGLE_SERVICE_ACCOUNT"), + } + } + }}; + } + + #[tokio::test] + async fn gcs_test() { + let config = maybe_skip_integration!(); + let integration = new_gcs(config.service_account, config.bucket).unwrap(); + + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + } + + #[tokio::test] + async fn gcs_test_get_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.get(&location).await.unwrap_err(); + + assert!( + matches!(err, ObjectStoreError::NotFound { .. }), + "unexpected error type: {}", + err + ); + } + + #[tokio::test] + async fn gcs_test_get_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + + assert!( + matches!(err, ObjectStoreError::NotFound { .. 
}), + "unexpected error type: {}", + err + ); + } + + #[tokio::test] + async fn gcs_test_delete_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.delete(&location).await.unwrap_err(); + assert!( + matches!(err, ObjectStoreError::NotFound { .. }), + "unexpected error type: {}", + err + ); + } + + #[tokio::test] + async fn gcs_test_delete_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.delete(&location).await.unwrap_err(); + assert!( + matches!(err, ObjectStoreError::NotFound { .. }), + "unexpected error type: {}", + err + ); + } + + #[tokio::test] + async fn gcs_test_put_nonexistent_bucket() { + let mut config = maybe_skip_integration!(); + config.bucket = NON_EXISTENT_NAME.into(); + let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + let data = Bytes::from("arbitrary data"); + + let err = integration + .put(&location, data) + .await + .unwrap_err() + .to_string(); + assert!( + err.contains( + "Error performing put request: HTTP status client error (404 Not Found)" + ), + "{}", + err + ) + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..4a56b03 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,706 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr +)] + +//! # object_store +//! +//! This crate provides APIs for interacting with object storage services. +//! +//! It currently supports PUT, GET, DELETE, HEAD and list for: +//! +//! * [Google Cloud Storage](https://cloud.google.com/storage/) +//! * [Amazon S3](https://aws.amazon.com/s3/) +//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/#overview) +//! * In-memory +//! * Local file storage +//! 
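+//!
+//! # Example
+//!
+//! A small usage sketch against the in-memory store; the cloud backends are
+//! driven through the same [`ObjectStore`] trait:
+//!
+//! ```
+//! # async fn example() -> object_store::Result<()> {
+//! use bytes::Bytes;
+//! use object_store::{memory::InMemory, path::Path, ObjectStore};
+//!
+//! let store = InMemory::new();
+//! let location = Path::from("data/file.txt");
+//! let payload = Bytes::from("hello world");
+//!
+//! store.put(&location, payload.clone()).await?;
+//! let read = store.get(&location).await?.bytes().await?;
+//! assert_eq!(read, payload);
+//! # Ok(())
+//! # }
+//! ```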
+ +#[cfg(feature = "aws")] +pub mod aws; +#[cfg(feature = "azure")] +pub mod azure; +#[cfg(feature = "gcp")] +pub mod gcp; +pub mod local; +pub mod memory; +pub mod path; +pub mod throttle; + +#[cfg(feature = "gcp")] +mod oauth; + +#[cfg(feature = "gcp")] +mod token; + +mod util; + +use crate::path::Path; +use crate::util::{collect_bytes, maybe_spawn_blocking}; +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use futures::{stream::BoxStream, StreamExt}; +use snafu::Snafu; +use std::fmt::{Debug, Formatter}; +use std::io::{Read, Seek, SeekFrom}; +use std::ops::Range; + +/// An alias for a dynamically dispatched object store implementation. +pub type DynObjectStore = dyn ObjectStore; + +/// Universal API to multiple object store services. +#[async_trait] +pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { + /// Save the provided bytes to the specified location. + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()>; + + /// Return the bytes that are stored at the specified location. + async fn get(&self, location: &Path) -> Result; + + /// Return the bytes that are stored at the specified location + /// in the given byte range + async fn get_range(&self, location: &Path, range: Range) -> Result; + + /// Return the metadata for the specified location + async fn head(&self, location: &Path) -> Result; + + /// Delete the object at the specified location. + async fn delete(&self, location: &Path) -> Result<()>; + + /// List all the objects with the given prefix. + /// + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// `foo/bar_baz/x`. + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>>; + + /// List objects with the given prefix and an implementation specific + /// delimiter. Returns common prefixes (directories) in addition to object + /// metadata. + /// + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// `foo/bar_baz/x`. + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; + + /// Copy an object from one path to another in the same object store. + /// + /// If there exists an object at the destination, it will be overwritten. + async fn copy(&self, from: &Path, to: &Path) -> Result<()>; + + /// Move an object from one path to another in the same object store. + /// + /// By default, this is implemented as a copy and then delete source. It may not + /// check when deleting source that it was the same object that was originally copied. + /// + /// If there exists an object at the destination, it will be overwritten. + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + self.copy(from, to).await?; + self.delete(from).await + } + + /// Copy an object from one path to another, only if destination is empty. + /// + /// Will return an error if the destination already has an object. + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()>; + + /// Move an object from one path to another in the same object store. + /// + /// Will return an error if the destination already has an object. + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.copy_if_not_exists(from, to).await?; + self.delete(from).await + } +} + +/// Result of a list call that includes objects, prefixes (directories) and a +/// token for the next set of results. 
Individual result sets may be limited to +/// 1,000 objects based on the underlying object storage's limitations. +#[derive(Debug)] +pub struct ListResult { + /// Prefixes that are common (like directories) + pub common_prefixes: Vec, + /// Object metadata for the listing + pub objects: Vec, +} + +/// The metadata that describes an object. +#[derive(Debug, Clone, PartialEq)] +pub struct ObjectMeta { + /// The full path to the object + pub location: Path, + /// The last modified time + pub last_modified: DateTime, + /// The size in bytes of the object + pub size: usize, +} + +/// Result for a get request +/// +/// This special cases the case of a local file, as some systems may +/// be able to optimise the case of a file already present on local disk +pub enum GetResult { + /// A file and its path on the local filesystem + File(std::fs::File, std::path::PathBuf), + /// An asynchronous stream + Stream(BoxStream<'static, Result>), +} + +impl Debug for GetResult { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::File(_, _) => write!(f, "GetResult(File)"), + Self::Stream(_) => write!(f, "GetResult(Stream)"), + } + } +} + +impl GetResult { + /// Collects the data into a [`Bytes`] + pub async fn bytes(self) -> Result { + match self { + Self::File(mut file, path) => { + maybe_spawn_blocking(move || { + let len = file.seek(SeekFrom::End(0)).map_err(|source| { + local::Error::Seek { + source, + path: path.clone(), + } + })?; + + file.seek(SeekFrom::Start(0)).map_err(|source| { + local::Error::Seek { + source, + path: path.clone(), + } + })?; + + let mut buffer = Vec::with_capacity(len as usize); + file.read_to_end(&mut buffer).map_err(|source| { + local::Error::UnableToReadBytes { source, path } + })?; + + Ok(buffer.into()) + }) + .await + } + Self::Stream(s) => collect_bytes(s, None).await, + } + } + + /// Converts this into a byte stream + /// + /// If the result is [`Self::File`] will perform chunked reads of the file, otherwise + /// will return the [`Self::Stream`]. + /// + /// # Tokio Compatibility + /// + /// Tokio discourages performing blocking IO on a tokio worker thread, however, + /// no major operating systems have stable async file APIs. Therefore if called from + /// a tokio context, this will use [`tokio::runtime::Handle::spawn_blocking`] to dispatch + /// IO to a blocking thread pool, much like `tokio::fs` does under-the-hood. 
+ /// + /// If not called from a tokio context, this will perform IO on the current thread with + /// no additional complexity or overheads + pub fn into_stream(self) -> BoxStream<'static, Result> { + match self { + Self::File(file, path) => { + const CHUNK_SIZE: usize = 8 * 1024; + + futures::stream::try_unfold( + (file, path, false), + |(mut file, path, finished)| { + maybe_spawn_blocking(move || { + if finished { + return Ok(None); + } + + let mut buffer = Vec::with_capacity(CHUNK_SIZE); + let read = file + .by_ref() + .take(CHUNK_SIZE as u64) + .read_to_end(&mut buffer) + .map_err(|e| local::Error::UnableToReadBytes { + source: e, + path: path.clone(), + })?; + + Ok(Some((buffer.into(), (file, path, read != CHUNK_SIZE)))) + }) + }, + ) + .boxed() + } + Self::Stream(s) => s, + } + } +} + +/// A specialized `Result` for object store-related errors +pub type Result = std::result::Result; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Generic {} error: {}", store, source))] + Generic { + store: &'static str, + source: Box, + }, + + #[snafu(display("Object at location {} not found: {}", path, source))] + NotFound { + path: String, + source: Box, + }, + + #[snafu( + display("Encountered object with invalid path: {}", source), + context(false) + )] + InvalidPath { source: path::Error }, + + #[snafu(display("Error joining spawned task: {}", source), context(false))] + JoinError { source: tokio::task::JoinError }, + + #[snafu(display("Operation not supported: {}", source))] + NotSupported { + source: Box, + }, + + #[snafu(display("Object at location {} already exists: {}", path, source))] + AlreadyExists { + path: String, + source: Box, + }, + + #[snafu(display("Operation not yet implemented."))] + NotImplemented, + + #[cfg(feature = "gcp")] + #[snafu(display("OAuth error: {}", source), context(false))] + OAuth { source: oauth::Error }, +} + +#[cfg(test)] +mod test_util { + use super::*; + use futures::TryStreamExt; + + pub async fn flatten_list_stream( + storage: &DynObjectStore, + prefix: Option<&Path>, + ) -> Result> { + storage + .list(prefix) + .await? 
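+            // Keep only the locations so callers can compare plain paths.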
+ .map_ok(|meta| meta.location) + .try_collect::>() + .await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::flatten_list_stream; + + type Error = Box; + type Result = std::result::Result; + + pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) -> Result<()> { + let store_str = storage.to_string(); + + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await?; + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {:?}", + content_list + ); + + let location = Path::from("test_dir/test_file.json"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + storage.put(&location, data).await?; + + let root = Path::from("/"); + + // List everything + let content_list = flatten_list_stream(storage, None).await?; + assert_eq!(content_list, &[location.clone()]); + + // Should behave the same as no prefix + let content_list = flatten_list_stream(storage, Some(&root)).await?; + assert_eq!(content_list, &[location.clone()]); + + // List with delimiter + let result = storage.list_with_delimiter(None).await.unwrap(); + assert_eq!(&result.objects, &[]); + assert_eq!(result.common_prefixes.len(), 1); + assert_eq!(result.common_prefixes[0], Path::from("test_dir")); + + // Should behave the same as no prefix + let result = storage.list_with_delimiter(Some(&root)).await.unwrap(); + assert!(result.objects.is_empty()); + assert_eq!(result.common_prefixes.len(), 1); + assert_eq!(result.common_prefixes[0], Path::from("test_dir")); + + // List everything starting with a prefix that should return results + let prefix = Path::from("test_dir"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a prefix that shouldn't return results + let prefix = Path::from("something"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert!(content_list.is_empty()); + + let read_data = storage.get(&location).await?.bytes().await?; + assert_eq!(&*read_data, expected_data); + + // Test range request + let range = 3..7; + let range_result = storage.get_range(&location, range.clone()).await; + + let out_of_range = 200..300; + let out_of_range_result = storage.get_range(&location, out_of_range).await; + + if store_str.starts_with("MicrosoftAzureEmulator") { + // Azurite doesn't support x-ms-range-get-content-crc64 set by Azure SDK + // https://github.com/Azure/Azurite/issues/444 + let err = range_result.unwrap_err().to_string(); + assert!(err.contains("x-ms-range-get-content-crc64 header or parameter is not supported in Azurite strict mode"), "{}", err); + + let err = out_of_range_result.unwrap_err().to_string(); + assert!(err.contains("x-ms-range-get-content-crc64 header or parameter is not supported in Azurite strict mode"), "{}", err); + } else { + let bytes = range_result.unwrap(); + assert_eq!(bytes, expected_data.slice(range)); + + // Should be a non-fatal error + out_of_range_result.unwrap_err(); + } + + let head = storage.head(&location).await?; + assert_eq!(head.size, expected_data.len()); + + storage.delete(&location).await?; + + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + let err = storage.get(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + let err = storage.head(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + + // Test handling of paths containing an encoded delimiter + + let file_with_delimiter = Path::from_iter(["a", "b/c", "foo.file"]); + storage + .put(&file_with_delimiter, Bytes::from("arbitrary")) + .await + .unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(files, vec![file_with_delimiter.clone()]); + + let files = flatten_list_stream(storage, Some(&Path::from("a/b"))) + .await + .unwrap(); + assert!(files.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from("a/b"))) + .await + .unwrap(); + assert!(files.common_prefixes.is_empty()); + assert!(files.objects.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from("a"))) + .await + .unwrap(); + assert_eq!(files.common_prefixes, vec![Path::from_iter(["a", "b/c"])]); + assert!(files.objects.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from_iter(["a", "b/c"]))) + .await + .unwrap(); + assert!(files.common_prefixes.is_empty()); + assert_eq!(files.objects.len(), 1); + assert_eq!(files.objects[0].location, file_with_delimiter); + + storage.delete(&file_with_delimiter).await.unwrap(); + + // Test handling of paths containing non-ASCII characters, e.g. emoji + + let emoji_prefix = Path::from("🙀"); + let emoji_file = Path::from("🙀/😀.parquet"); + storage + .put(&emoji_file, Bytes::from("arbitrary")) + .await + .unwrap(); + + storage.head(&emoji_file).await.unwrap(); + storage + .get(&emoji_file) + .await + .unwrap() + .bytes() + .await + .unwrap(); + + let files = flatten_list_stream(storage, Some(&emoji_prefix)) + .await + .unwrap(); + + assert_eq!(files, vec![emoji_file.clone()]); + + storage.delete(&emoji_file).await.unwrap(); + let files = flatten_list_stream(storage, Some(&emoji_prefix)) + .await + .unwrap(); + assert!(files.is_empty()); + + Ok(()) + } + + pub(crate) async fn list_uses_directories_correctly( + storage: &DynObjectStore, + ) -> Result<()> { + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await?; + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {:?}", + content_list + ); + + let location1 = Path::from("foo/x.json"); + let location2 = Path::from("foo.bar/y.json"); + + let data = Bytes::from("arbitrary data"); + storage.put(&location1, data.clone()).await?; + storage.put(&location2, data).await?; + + let prefix = Path::from("foo"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[location1.clone()]); + + let prefix = Path::from("foo/x"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[]); + + Ok(()) + } + + pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) -> Result<()> { + delete_fixtures(storage).await; + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + // ==================== do: create files ==================== + let data = Bytes::from("arbitrary data"); + + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/wbwbwb/111/222/333.segment", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| Path::from(s)) + .collect(); + + for f in &files { + let data = data.clone(); + storage.put(f, data).await.unwrap(); + } + + // ==================== check: 
prefix-list `mydb/wb` (directory) ==================== + let prefix = Path::from("mydb/wb"); + + let expected_000 = Path::from("mydb/wb/000"); + let expected_001 = Path::from("mydb/wb/001"); + let expected_location = Path::from("mydb/wb/foo.json"); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + + assert_eq!(result.common_prefixes, vec![expected_000, expected_001]); + assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + assert_eq!(object.size, data.len()); + + // ==================== check: prefix-list `mydb/wb/000/000/001` (partial filename doesn't match) ==================== + let prefix = Path::from("mydb/wb/000/000/001"); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert_eq!(result.objects.len(), 0); + + // ==================== check: prefix-list `not_there` (non-existing prefix) ==================== + let prefix = Path::from("not_there"); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert!(result.objects.is_empty()); + + // ==================== do: remove all files ==================== + for f in &files { + storage.delete(f).await.unwrap(); + } + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + Ok(()) + } + + pub(crate) async fn get_nonexistent_object( + storage: &DynObjectStore, + location: Option, + ) -> crate::Result { + let location = + location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); + + let err = storage.head(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. })); + + storage.get(&location).await?.bytes().await + } + + pub(crate) async fn rename_and_copy(storage: &DynObjectStore) -> Result<()> { + // Create two objects + let path1 = Path::from("test1"); + let path2 = Path::from("test2"); + let contents1 = Bytes::from("cats"); + let contents2 = Bytes::from("dogs"); + + // copy() make both objects identical + storage.put(&path1, contents1.clone()).await?; + storage.put(&path2, contents2.clone()).await?; + storage.copy(&path1, &path2).await?; + let new_contents = storage.get(&path2).await?.bytes().await?; + assert_eq!(&new_contents, &contents1); + + // rename() copies contents and deletes original + storage.put(&path1, contents1.clone()).await?; + storage.put(&path2, contents2.clone()).await?; + storage.rename(&path1, &path2).await?; + let new_contents = storage.get(&path2).await?.bytes().await?; + assert_eq!(&new_contents, &contents1); + let result = storage.get(&path1).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // Clean up + storage.delete(&path2).await?; + + Ok(()) + } + + pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) -> Result<()> { + // Create two objects + let path1 = Path::from("test1"); + let path2 = Path::from("test2"); + let contents1 = Bytes::from("cats"); + let contents2 = Bytes::from("dogs"); + + // copy_if_not_exists() errors if destination already exists + storage.put(&path1, contents1.clone()).await?; + storage.put(&path2, contents2.clone()).await?; + let result = storage.copy_if_not_exists(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + crate::Error::AlreadyExists { .. 
} + )); + + // copy_if_not_exists() copies contents and allows deleting original + storage.delete(&path2).await?; + storage.copy_if_not_exists(&path1, &path2).await?; + storage.delete(&path1).await?; + let new_contents = storage.get(&path2).await?.bytes().await?; + assert_eq!(&new_contents, &contents1); + let result = storage.get(&path1).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // Clean up + storage.delete(&path2).await?; + + Ok(()) + } + + async fn delete_fixtures(storage: &DynObjectStore) { + let paths = flatten_list_stream(storage, None).await.unwrap(); + + for f in &paths { + let _ = storage.delete(f).await; + } + } + + /// Test that the returned stream does not borrow the lifetime of Path + async fn list_store<'a, 'b>( + store: &'a dyn ObjectStore, + path_str: &'b str, + ) -> super::Result>> { + let path = Path::from(path_str); + store.list(Some(&path)).await + } + + #[tokio::test] + async fn test_list_lifetimes() { + let store = memory::InMemory::new(); + let stream = list_store(&store, "path").await.unwrap(); + assert_eq!(stream.count().await, 0); + } + + // Tests TODO: + // GET nonexisting location (in_memory/file) + // DELETE nonexisting location + // PUT overwriting +} diff --git a/src/local.rs b/src/local.rs new file mode 100644 index 0000000..8a9462e --- /dev/null +++ b/src/local.rs @@ -0,0 +1,773 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
An object store implementation for a local filesystem +use crate::{ + maybe_spawn_blocking, + path::{filesystem_path_to_url, Path}, + GetResult, ListResult, ObjectMeta, ObjectStore, Result, +}; +use async_trait::async_trait; +use bytes::Bytes; +use futures::{stream::BoxStream, StreamExt}; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use std::collections::VecDeque; +use std::fs::File; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::ops::Range; +use std::sync::Arc; +use std::{collections::BTreeSet, convert::TryFrom, io}; +use url::Url; +use walkdir::{DirEntry, WalkDir}; + +/// A specialized `Error` for filesystem object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub(crate) enum Error { + #[snafu(display("File size for {} did not fit in a usize: {}", path, source))] + FileSizeOverflowedUsize { + source: std::num::TryFromIntError, + path: String, + }, + + #[snafu(display("Unable to walk dir: {}", source))] + UnableToWalkDir { + source: walkdir::Error, + }, + + #[snafu(display("Unable to access metadata for {}: {}", path, source))] + UnableToAccessMetadata { + source: Box, + path: String, + }, + + #[snafu(display("Unable to copy data to file: {}", source))] + UnableToCopyDataToFile { + source: io::Error, + }, + + #[snafu(display("Unable to create dir {}: {}", path.display(), source))] + UnableToCreateDir { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Unable to create file {}: {}", path.display(), err))] + UnableToCreateFile { + path: std::path::PathBuf, + err: io::Error, + }, + + #[snafu(display("Unable to delete file {}: {}", path.display(), source))] + UnableToDeleteFile { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Unable to open file {}: {}", path.display(), source))] + UnableToOpenFile { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Unable to read data from file {}: {}", path.display(), source))] + UnableToReadBytes { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual))] + OutOfRange { + path: std::path::PathBuf, + expected: usize, + actual: usize, + }, + + #[snafu(display("Unable to copy file from {} to {}: {}", from.display(), to.display(), source))] + UnableToCopyFile { + from: std::path::PathBuf, + to: std::path::PathBuf, + source: io::Error, + }, + + NotFound { + path: std::path::PathBuf, + source: io::Error, + }, + + #[snafu(display("Error seeking file {}: {}", path.display(), source))] + Seek { + source: io::Error, + path: std::path::PathBuf, + }, + + #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] + InvalidUrl { + url: Url, + }, + + AlreadyExists { + path: String, + source: io::Error, + }, +} + +impl From for super::Error { + fn from(source: Error) -> Self { + match source { + Error::NotFound { path, source } => Self::NotFound { + path: path.to_string_lossy().to_string(), + source: source.into(), + }, + Error::AlreadyExists { path, source } => Self::AlreadyExists { + path, + source: source.into(), + }, + _ => Self::Generic { + store: "LocalFileSystem", + source: Box::new(source), + }, + } + } +} + +/// Local filesystem storage providing an [`ObjectStore`] interface to files on +/// local disk. Can optionally be created with a directory prefix +/// +/// # Path Semantics +/// +/// This implementation follows the [file URI] scheme outlined in [RFC 3986]. 
In +/// particular paths are delimited by `/` +/// +/// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme +/// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986 +/// +/// # Tokio Compatibility +/// +/// Tokio discourages performing blocking IO on a tokio worker thread, however, +/// no major operating systems have stable async file APIs. Therefore if called from +/// a tokio context, this will use [`tokio::runtime::Handle::spawn_blocking`] to dispatch +/// IO to a blocking thread pool, much like `tokio::fs` does under-the-hood. +/// +/// If not called from a tokio context, this will perform IO on the current thread with +/// no additional complexity or overheads +#[derive(Debug)] +pub struct LocalFileSystem { + config: Arc, +} + +#[derive(Debug)] +struct Config { + root: Url, +} + +impl std::fmt::Display for LocalFileSystem { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "LocalFileSystem({})", self.config.root) + } +} + +impl Default for LocalFileSystem { + fn default() -> Self { + Self::new() + } +} + +impl LocalFileSystem { + /// Create new filesystem storage with no prefix + pub fn new() -> Self { + Self { + config: Arc::new(Config { + root: Url::parse("file:///").unwrap(), + }), + } + } + + /// Create new filesystem storage with `prefix` applied to all paths + pub fn new_with_prefix(prefix: impl AsRef) -> Result { + Ok(Self { + config: Arc::new(Config { + root: filesystem_path_to_url(prefix)?, + }), + }) + } +} + +impl Config { + /// Return filesystem path of the given location + fn path_to_filesystem(&self, location: &Path) -> Result { + let mut url = self.root.clone(); + url.path_segments_mut() + .expect("url path") + .extend(location.parts()); + + url.to_file_path() + .map_err(|_| Error::InvalidUrl { url }.into()) + } + + fn filesystem_to_path(&self, location: &std::path::Path) -> Result { + Ok(Path::from_filesystem_path_with_base( + location, + Some(&self.root), + )?) 
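+        // Paths are expressed relative to the configured `root`, so listings
+        // return store-relative locations rather than absolute filesystem paths.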
+ } +} + +#[async_trait] +impl ObjectStore for LocalFileSystem { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let path = self.config.path_to_filesystem(location)?; + + maybe_spawn_blocking(move || { + let mut file = match File::create(&path) { + Ok(f) => f, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + let parent = path + .parent() + .context(UnableToCreateFileSnafu { path: &path, err })?; + std::fs::create_dir_all(&parent) + .context(UnableToCreateDirSnafu { path: parent })?; + + match File::create(&path) { + Ok(f) => f, + Err(err) => { + return Err(Error::UnableToCreateFile { path, err }.into()) + } + } + } + Err(err) => return Err(Error::UnableToCreateFile { path, err }.into()), + }; + + file.write_all(&bytes) + .context(UnableToCopyDataToFileSnafu)?; + + Ok(()) + }) + .await + } + + async fn get(&self, location: &Path) -> Result { + let path = self.config.path_to_filesystem(location)?; + maybe_spawn_blocking(move || { + let file = open_file(&path)?; + Ok(GetResult::File(file, path)) + }) + .await + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let path = self.config.path_to_filesystem(location)?; + maybe_spawn_blocking(move || { + let mut file = open_file(&path)?; + let to_read = range.end - range.start; + file.seek(SeekFrom::Start(range.start as u64)) + .context(SeekSnafu { path: &path })?; + + let mut buf = Vec::with_capacity(to_read); + let read = file + .take(to_read as u64) + .read_to_end(&mut buf) + .context(UnableToReadBytesSnafu { path: &path })?; + + ensure!( + read == to_read, + OutOfRangeSnafu { + path: &path, + expected: to_read, + actual: read + } + ); + + Ok(buf.into()) + }) + .await + } + + async fn head(&self, location: &Path) -> Result { + let path = self.config.path_to_filesystem(location)?; + let location = location.clone(); + + maybe_spawn_blocking(move || { + let file = open_file(&path)?; + let metadata = + file.metadata().map_err(|e| Error::UnableToAccessMetadata { + source: e.into(), + path: location.to_string(), + })?; + + convert_metadata(metadata, location) + }) + .await + } + + async fn delete(&self, location: &Path) -> Result<()> { + let path = self.config.path_to_filesystem(location)?; + maybe_spawn_blocking(move || { + std::fs::remove_file(&path).context(UnableToDeleteFileSnafu { path })?; + Ok(()) + }) + .await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let config = Arc::clone(&self.config); + + let root_path = match prefix { + Some(prefix) => config.path_to_filesystem(prefix)?, + None => self.config.root.to_file_path().unwrap(), + }; + + let walkdir = WalkDir::new(&root_path) + // Don't include the root directory itself + .min_depth(1); + + let s = walkdir.into_iter().flat_map(move |result_dir_entry| { + match convert_walkdir_result(result_dir_entry) { + Err(e) => Some(Err(e)), + Ok(None) => None, + Ok(entry @ Some(_)) => entry + .filter(|dir_entry| dir_entry.file_type().is_file()) + .map(|entry| { + let location = config.filesystem_to_path(entry.path())?; + convert_entry(entry, location) + }), + } + }); + + // If no tokio context, return iterator directly as no + // need to perform chunked spawn_blocking reads + if tokio::runtime::Handle::try_current().is_err() { + return Ok(futures::stream::iter(s).boxed()); + } + + // Otherwise list in batches of CHUNK_SIZE + const CHUNK_SIZE: usize = 1024; + + let buffer = VecDeque::with_capacity(CHUNK_SIZE); + let stream = + futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { + if 
buffer.is_empty() { + (s, buffer) = tokio::task::spawn_blocking(move || { + for _ in 0..CHUNK_SIZE { + match s.next() { + Some(r) => buffer.push_back(r), + None => break, + } + } + (s, buffer) + }) + .await?; + } + + match buffer.pop_front() { + Some(Err(e)) => Err(e), + Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), + None => Ok(None), + } + }); + + Ok(stream.boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let config = Arc::clone(&self.config); + + let prefix = prefix.cloned().unwrap_or_default(); + let resolved_prefix = config.path_to_filesystem(&prefix)?; + + maybe_spawn_blocking(move || { + let walkdir = WalkDir::new(&resolved_prefix).min_depth(1).max_depth(1); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + for entry_res in walkdir.into_iter().map(convert_walkdir_result) { + if let Some(entry) = entry_res? { + let is_directory = entry.file_type().is_dir(); + let entry_location = config.filesystem_to_path(entry.path())?; + + let mut parts = match entry_location.prefix_match(&prefix) { + Some(parts) => parts, + None => continue, + }; + + let common_prefix = match parts.next() { + Some(p) => p, + None => continue, + }; + + drop(parts); + + if is_directory { + common_prefixes.insert(prefix.child(common_prefix)); + } else { + objects.push(convert_entry(entry, entry_location)?); + } + } + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + }) + .await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let from = self.config.path_to_filesystem(from)?; + let to = self.config.path_to_filesystem(to)?; + + maybe_spawn_blocking(move || { + std::fs::copy(&from, &to).context(UnableToCopyFileSnafu { from, to })?; + Ok(()) + }) + .await + } + + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + let from = self.config.path_to_filesystem(from)?; + let to = self.config.path_to_filesystem(to)?; + maybe_spawn_blocking(move || { + std::fs::rename(&from, &to).context(UnableToCopyFileSnafu { from, to })?; + Ok(()) + }) + .await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let from = self.config.path_to_filesystem(from)?; + let to = self.config.path_to_filesystem(to)?; + + maybe_spawn_blocking(move || { + std::fs::hard_link(&from, &to).map_err(|err| match err.kind() { + io::ErrorKind::AlreadyExists => Error::AlreadyExists { + path: to.to_str().unwrap().to_string(), + source: err, + } + .into(), + _ => Error::UnableToCopyFile { + from, + to, + source: err, + } + .into(), + }) + }) + .await + } +} + +fn open_file(path: &std::path::PathBuf) -> Result { + let file = File::open(path).map_err(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + Error::NotFound { + path: path.clone(), + source: e, + } + } else { + Error::UnableToOpenFile { + path: path.clone(), + source: e, + } + } + })?; + Ok(file) +} + +fn convert_entry(entry: DirEntry, location: Path) -> Result { + let metadata = entry + .metadata() + .map_err(|e| Error::UnableToAccessMetadata { + source: e.into(), + path: location.to_string(), + })?; + convert_metadata(metadata, location) +} + +fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { + let last_modified = metadata + .modified() + .expect("Modified file time should be supported on this platform") + .into(); + + let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { + path: location.as_ref(), + })?; + + Ok(ObjectMeta { + location, + last_modified, + 
size, + }) +} + +/// Convert walkdir results and converts not-found errors into `None`. +fn convert_walkdir_result( + res: std::result::Result, +) -> Result> { + match res { + Ok(entry) => Ok(Some(entry)), + Err(walkdir_err) => match walkdir_err.io_error() { + Some(io_err) => match io_err.kind() { + io::ErrorKind::NotFound => Ok(None), + _ => Err(Error::UnableToWalkDir { + source: walkdir_err, + } + .into()), + }, + None => Err(Error::UnableToWalkDir { + source: walkdir_err, + } + .into()), + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::flatten_list_stream; + use crate::{ + tests::{ + copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list, rename_and_copy, + }, + Error as ObjectStoreError, ObjectStore, + }; + use tempfile::TempDir; + + #[tokio::test] + async fn file_test() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + copy_if_not_exists(&integration).await.unwrap(); + } + + #[test] + fn test_non_tokio() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + futures::executor::block_on(async move { + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + }); + } + + #[tokio::test] + async fn creates_dir_if_not_present() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("nested/file/test_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + integration.put(&location, data).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + async fn unknown_length() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + integration.put(&location, data).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + #[cfg(target_family = "unix")] + // Fails on github actions runner (which runs the tests as root) + #[ignore] + async fn bubble_up_io_errors() { + use std::{fs::set_permissions, os::unix::prelude::PermissionsExt}; + + let root = TempDir::new().unwrap(); + + // make non-readable + let metadata = root.path().metadata().unwrap(); + let mut permissions = metadata.permissions(); + permissions.set_mode(0o000); + set_permissions(root.path(), permissions).unwrap(); + + let store = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + // `list` must fail + match store.list(None).await { + Err(_) => { + // ok, error found + } + Ok(mut stream) => { + let mut any_err = false; + while let Some(res) = stream.next().await { + if res.is_err() { + any_err = true; + } + } + assert!(any_err); + } + } + + // `list_with_delimiter + 
assert!(store.list_with_delimiter(None).await.is_err()); + } + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + #[tokio::test] + async fn get_nonexistent_location() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from(NON_EXISTENT_NAME); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + if let ObjectStoreError::NotFound { path, source } = err { + let source_variant = source.downcast_ref::(); + assert!( + matches!(source_variant, Some(std::io::Error { .. }),), + "got: {:?}", + source_variant + ); + assert!(path.ends_with(NON_EXISTENT_NAME), "{}", path); + } else { + panic!("unexpected error type: {:?}", err); + } + } + + #[tokio::test] + async fn root() { + let integration = LocalFileSystem::new(); + + let canonical = std::path::Path::new("Cargo.toml").canonicalize().unwrap(); + let url = Url::from_directory_path(&canonical).unwrap(); + let path = Path::parse(url.path()).unwrap(); + + let roundtrip = integration.config.path_to_filesystem(&path).unwrap(); + + // Needed as on Windows canonicalize returns extended length path syntax + // C:\Users\circleci -> \\?\C:\Users\circleci + let roundtrip = roundtrip.canonicalize().unwrap(); + + assert_eq!(roundtrip, canonical); + + integration.head(&path).await.unwrap(); + } + + #[tokio::test] + #[cfg(target_os = "linux")] + // macos has some magic in its root '/.VolumeIcon.icns"' which causes this test to fail + async fn test_list_root() { + let integration = LocalFileSystem::new(); + let result = integration.list_with_delimiter(None).await; + if cfg!(target_family = "windows") { + let r = result.unwrap_err().to_string(); + assert!( + r.contains("Unable to convert URL \"file:///\" to filesystem path"), + "{}", + r + ); + } else { + result.unwrap(); + } + } + + #[tokio::test] + async fn invalid_path() { + let root = TempDir::new().unwrap(); + let root = root.path().join("🙀"); + std::fs::create_dir(root.clone()).unwrap(); + + // Invalid paths supported above root of store + let integration = LocalFileSystem::new_with_prefix(root.clone()).unwrap(); + + let directory = Path::from("directory"); + let object = directory.child("child.txt"); + let data = Bytes::from("arbitrary"); + integration.put(&object, data.clone()).await.unwrap(); + integration.head(&object).await.unwrap(); + let result = integration.get(&object).await.unwrap(); + assert_eq!(result.bytes().await.unwrap(), data); + + flatten_list_stream(&integration, None).await.unwrap(); + flatten_list_stream(&integration, Some(&directory)) + .await + .unwrap(); + + let result = integration + .list_with_delimiter(Some(&directory)) + .await + .unwrap(); + assert_eq!(result.objects.len(), 1); + assert!(result.common_prefixes.is_empty()); + assert_eq!(result.objects[0].location, object); + + let illegal = root.join("💀"); + std::fs::write(illegal, "foo").unwrap(); + + // Can list directory that doesn't contain illegal path + flatten_list_stream(&integration, Some(&directory)) + .await + .unwrap(); + + // Cannot list illegal file + let err = flatten_list_stream(&integration, None) + .await + .unwrap_err() + .to_string(); + + assert!( + err.contains("Invalid path segment - got \"💀\" expected: \"%F0%9F%92%80\""), + "{}", + err + ); + } +} diff --git a/src/memory.rs b/src/memory.rs new file mode 100644 index 0000000..ffd8e3a --- /dev/null +++ b/src/memory.rs @@ -0,0 +1,297 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An in-memory object store implementation +use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use async_trait::async_trait; +use bytes::Bytes; +use chrono::Utc; +use futures::{stream::BoxStream, StreamExt}; +use parking_lot::RwLock; +use snafu::{ensure, OptionExt, Snafu}; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::ops::Range; + +/// A specialized `Error` for in-memory object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("No data in memory found. Location: {path}"))] + NoDataInMemory { path: String }, + + #[snafu(display("Out of range"))] + OutOfRange, + + #[snafu(display("Bad range"))] + BadRange, + + #[snafu(display("Object already exists at that location: {path}"))] + AlreadyExists { path: String }, +} + +impl From for super::Error { + fn from(source: Error) -> Self { + match source { + Error::NoDataInMemory { ref path } => Self::NotFound { + path: path.into(), + source: source.into(), + }, + Error::AlreadyExists { ref path } => Self::AlreadyExists { + path: path.into(), + source: source.into(), + }, + _ => Self::Generic { + store: "InMemory", + source: Box::new(source), + }, + } + } +} + +/// In-memory storage suitable for testing or for opting out of using a cloud +/// storage provider. 
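+///
+/// A minimal usage sketch (illustrative only; error handling is elided and the
+/// `data/file.txt` path is just a placeholder):
+///
+/// ```no_run
+/// # use bytes::Bytes;
+/// # use object_store::{memory::InMemory, path::Path, ObjectStore};
+/// # async fn example() -> object_store::Result<()> {
+/// let store = InMemory::new();
+/// let path = Path::from("data/file.txt");
+/// store.put(&path, Bytes::from("hello")).await?;
+/// let bytes = store.get(&path).await?.bytes().await?;
+/// assert_eq!(bytes, Bytes::from("hello"));
+/// # Ok(())
+/// # }
+/// ```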
+#[derive(Debug, Default)] +pub struct InMemory { + storage: RwLock>, +} + +impl std::fmt::Display for InMemory { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "InMemory") + } +} + +#[async_trait] +impl ObjectStore for InMemory { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.storage.write().insert(location.clone(), bytes); + Ok(()) + } + + async fn get(&self, location: &Path) -> Result { + let data = self.get_bytes(location).await?; + + Ok(GetResult::Stream( + futures::stream::once(async move { Ok(data) }).boxed(), + )) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let data = self.get_bytes(location).await?; + ensure!(range.end <= data.len(), OutOfRangeSnafu); + ensure!(range.start <= range.end, BadRangeSnafu); + + Ok(data.slice(range)) + } + + async fn head(&self, location: &Path) -> Result { + let last_modified = Utc::now(); + let bytes = self.get_bytes(location).await?; + Ok(ObjectMeta { + location: location.clone(), + last_modified, + size: bytes.len(), + }) + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.storage.write().remove(location); + Ok(()) + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let last_modified = Utc::now(); + + let storage = self.storage.read(); + let values: Vec<_> = storage + .iter() + .filter(move |(key, _)| prefix.map(|p| key.prefix_matches(p)).unwrap_or(true)) + .map(move |(key, value)| { + Ok(ObjectMeta { + location: key.clone(), + last_modified, + size: value.len(), + }) + }) + .collect(); + + Ok(futures::stream::iter(values).boxed()) + } + + /// The memory implementation returns all results, as opposed to the cloud + /// versions which limit their results to 1k or more because of API + /// limitations. + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let root = Path::default(); + let prefix = prefix.unwrap_or(&root); + + let mut common_prefixes = BTreeSet::new(); + let last_modified = Utc::now(); + + // Only objects in this base level should be returned in the + // response. Otherwise, we just collect the common prefixes. + let mut objects = vec![]; + for (k, v) in self.storage.read().range((prefix)..) { + let mut parts = match k.prefix_match(prefix) { + Some(parts) => parts, + None => break, + }; + + // Pop first element + let common_prefix = match parts.next() { + Some(p) => p, + None => continue, + }; + + if parts.next().is_some() { + common_prefixes.insert(prefix.child(common_prefix)); + } else { + let object = ObjectMeta { + location: k.clone(), + last_modified, + size: v.len(), + }; + objects.push(object); + } + } + + Ok(ListResult { + objects, + common_prefixes: common_prefixes.into_iter().collect(), + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let data = self.get_bytes(from).await?; + self.storage.write().insert(to.clone(), data); + Ok(()) + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let data = self.get_bytes(from).await?; + let mut storage = self.storage.write(); + if storage.contains_key(to) { + return Err(Error::AlreadyExists { + path: to.to_string(), + } + .into()); + } + storage.insert(to.clone(), data); + Ok(()) + } +} + +impl InMemory { + /// Create new in-memory storage. 
+ pub fn new() -> Self { + Self::default() + } + + /// Creates a clone of the store + pub async fn clone(&self) -> Self { + let storage = self.storage.read(); + let storage = storage.clone(); + + Self { + storage: RwLock::new(storage), + } + } + + async fn get_bytes(&self, location: &Path) -> Result { + let storage = self.storage.read(); + let bytes = storage + .get(location) + .cloned() + .context(NoDataInMemorySnafu { + path: location.to_string(), + })?; + Ok(bytes) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::{ + tests::{ + copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list, rename_and_copy, + }, + Error as ObjectStoreError, ObjectStore, + }; + + #[tokio::test] + async fn in_memory_test() { + let integration = InMemory::new(); + + put_get_delete_list(&integration).await.unwrap(); + list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + copy_if_not_exists(&integration).await.unwrap(); + } + + #[tokio::test] + async fn unknown_length() { + let integration = InMemory::new(); + + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + integration.put(&location, data).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + #[tokio::test] + async fn nonexistent_location() { + let integration = InMemory::new(); + + let location = Path::from(NON_EXISTENT_NAME); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + if let ObjectStoreError::NotFound { path, source } = err { + let source_variant = source.downcast_ref::(); + assert!( + matches!(source_variant, Some(Error::NoDataInMemory { .. }),), + "got: {:?}", + source_variant + ); + assert_eq!(path, NON_EXISTENT_NAME); + } else { + panic!("unexpected error type: {:?}", err); + } + } +} diff --git a/src/oauth.rs b/src/oauth.rs new file mode 100644 index 0000000..273e37b --- /dev/null +++ b/src/oauth.rs @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
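+
+//! Support for fetching short-lived OAuth2 access tokens via the JWT bearer
+//! grant (`urn:ietf:params:oauth:grant-type:jwt-bearer`): claims are signed
+//! with an RSA service-account key and exchanged for a bearer token through
+//! an HTTP form POST, as used for Google Cloud Storage authentication in
+//! this crate.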
+ +use crate::token::TemporaryToken; +use reqwest::{Client, Method}; +use ring::signature::RsaKeyPair; +use snafu::{ResultExt, Snafu}; +use std::time::{Duration, Instant}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("No RSA key found in pem file"))] + MissingKey, + + #[snafu(display("Invalid RSA key: {}", source), context(false))] + InvalidKey { source: ring::error::KeyRejected }, + + #[snafu(display("Error signing jwt: {}", source))] + Sign { source: ring::error::Unspecified }, + + #[snafu(display("Error encoding jwt payload: {}", source))] + Encode { source: serde_json::Error }, + + #[snafu(display("Unsupported key encoding: {}", encoding))] + UnsupportedKey { encoding: String }, + + #[snafu(display("Error performing token request: {}", source))] + TokenRequest { source: reqwest::Error }, +} + +pub type Result = std::result::Result; + +#[derive(Debug, Default, serde::Serialize)] +pub struct JwtHeader { + /// The type of JWS: it can only be "JWT" here + /// + /// Defined in [RFC7515#4.1.9](https://tools.ietf.org/html/rfc7515#section-4.1.9). + #[serde(skip_serializing_if = "Option::is_none")] + pub typ: Option, + /// The algorithm used + /// + /// Defined in [RFC7515#4.1.1](https://tools.ietf.org/html/rfc7515#section-4.1.1). + pub alg: String, + /// Content type + /// + /// Defined in [RFC7519#5.2](https://tools.ietf.org/html/rfc7519#section-5.2). + #[serde(skip_serializing_if = "Option::is_none")] + pub cty: Option, + /// JSON Key URL + /// + /// Defined in [RFC7515#4.1.2](https://tools.ietf.org/html/rfc7515#section-4.1.2). + #[serde(skip_serializing_if = "Option::is_none")] + pub jku: Option, + /// Key ID + /// + /// Defined in [RFC7515#4.1.4](https://tools.ietf.org/html/rfc7515#section-4.1.4). + #[serde(skip_serializing_if = "Option::is_none")] + pub kid: Option, + /// X.509 URL + /// + /// Defined in [RFC7515#4.1.5](https://tools.ietf.org/html/rfc7515#section-4.1.5). + #[serde(skip_serializing_if = "Option::is_none")] + pub x5u: Option, + /// X.509 certificate thumbprint + /// + /// Defined in [RFC7515#4.1.7](https://tools.ietf.org/html/rfc7515#section-4.1.7). 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub x5t: Option, +} + +#[derive(serde::Serialize)] +struct TokenClaims<'a> { + iss: &'a str, + scope: &'a str, + aud: &'a str, + exp: u64, + iat: u64, +} + +#[derive(serde::Deserialize, Debug)] +struct TokenResponse { + access_token: String, + expires_in: u64, +} + +/// Encapsulates the logic to perform an OAuth token challenge +#[derive(Debug)] +pub struct OAuthProvider { + issuer: String, + scope: String, + audience: String, + key_pair: RsaKeyPair, + jwt_header: String, + random: ring::rand::SystemRandom, +} + +impl OAuthProvider { + /// Create a new [`OAuthProvider`] + pub fn new( + issuer: String, + private_key_pem: String, + scope: String, + audience: String, + ) -> Result { + let key_pair = decode_first_rsa_key(private_key_pem)?; + let jwt_header = b64_encode_obj(&JwtHeader { + alg: "RS256".to_string(), + ..Default::default() + })?; + + Ok(Self { + issuer, + key_pair, + scope, + audience, + jwt_header, + random: ring::rand::SystemRandom::new(), + }) + } + + /// Fetch a fresh token + pub async fn fetch_token(&self, client: &Client) -> Result> { + let now = seconds_since_epoch(); + let exp = now + 3600; + + let claims = TokenClaims { + iss: &self.issuer, + scope: &self.scope, + aud: &self.audience, + exp, + iat: now, + }; + + let claim_str = b64_encode_obj(&claims)?; + let message = [self.jwt_header.as_ref(), claim_str.as_ref()].join("."); + let mut sig_bytes = vec![0; self.key_pair.public_modulus_len()]; + self.key_pair + .sign( + &ring::signature::RSA_PKCS1_SHA256, + &self.random, + message.as_bytes(), + &mut sig_bytes, + ) + .context(SignSnafu)?; + + let signature = base64::encode_config(&sig_bytes, base64::URL_SAFE_NO_PAD); + let jwt = [message, signature].join("."); + + let body = [ + ("grant_type", "urn:ietf:params:oauth:grant-type:jwt-bearer"), + ("assertion", &jwt), + ]; + + let response: TokenResponse = client + .request(Method::POST, &self.audience) + .form(&body) + .send() + .await + .context(TokenRequestSnafu)? + .error_for_status() + .context(TokenRequestSnafu)? + .json() + .await + .context(TokenRequestSnafu)?; + + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + + Ok(token) + } +} + +/// Returns the number of seconds since unix epoch +fn seconds_since_epoch() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() +} + +fn decode_first_rsa_key(private_key_pem: String) -> Result { + use rustls_pemfile::Item; + use std::io::{BufReader, Cursor}; + + let mut cursor = Cursor::new(private_key_pem); + let mut reader = BufReader::new(&mut cursor); + + // Reading from string is infallible + match rustls_pemfile::read_one(&mut reader).unwrap() { + Some(Item::PKCS8Key(key)) => Ok(RsaKeyPair::from_pkcs8(&key)?), + Some(Item::RSAKey(key)) => Ok(RsaKeyPair::from_der(&key)?), + _ => Err(Error::MissingKey), + } +} + +fn b64_encode_obj(obj: &T) -> Result { + let string = serde_json::to_string(obj).context(EncodeSnafu)?; + Ok(base64::encode_config(string, base64::URL_SAFE_NO_PAD)) +} diff --git a/src/path/mod.rs b/src/path/mod.rs new file mode 100644 index 0000000..23488ef --- /dev/null +++ b/src/path/mod.rs @@ -0,0 +1,531 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Path abstraction for Object Storage + +use itertools::Itertools; +use percent_encoding::percent_decode; +use snafu::{ensure, ResultExt, Snafu}; +use std::fmt::Formatter; +use url::Url; + +/// The delimiter to separate object namespaces, creating a directory structure. +pub const DELIMITER: &str = "/"; + +/// The path delimiter as a single byte +pub const DELIMITER_BYTE: u8 = DELIMITER.as_bytes()[0]; + +mod parts; + +pub use parts::{InvalidPart, PathPart}; + +/// Error returned by [`Path::parse`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Path \"{}\" contained empty path segment", path))] + EmptySegment { path: String }, + + #[snafu(display("Error parsing Path \"{}\": {}", path, source))] + BadSegment { path: String, source: InvalidPart }, + + #[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))] + Canonicalize { + path: std::path::PathBuf, + source: std::io::Error, + }, + + #[snafu(display("Unable to convert path \"{}\" to URL", path.display()))] + InvalidPath { path: std::path::PathBuf }, + + #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + NonUnicode { + path: String, + source: std::str::Utf8Error, + }, + + #[snafu(display("Path {} does not start with prefix {}", path, prefix))] + PrefixMismatch { path: String, prefix: String }, +} + +/// A parsed path representation that can be safely written to object storage +/// +/// # Path Safety +/// +/// In theory object stores support any UTF-8 character sequence, however, certain character +/// sequences cause compatibility problems with some applications and protocols. As such the +/// naming guidelines for [S3], [GCS] and [Azure Blob Storage] all recommend sticking to a +/// limited character subset. +/// +/// [S3]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html +/// [GCS]: https://cloud.google.com/storage/docs/naming-objects +/// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names +/// +/// This presents libraries with two options for consistent path handling: +/// +/// 1. Allow constructing unsafe paths, allowing for both reading and writing of data to paths +/// that may not be consistently understood or supported +/// 2. Disallow constructing unsafe paths, ensuring data written can be consistently handled by +/// all other systems, but preventing interaction with objects at unsafe paths +/// +/// This library takes the second approach, in particular: +/// +/// * Paths are delimited by `/` +/// * Paths do not start with a `/` +/// * Empty path segments are discarded (e.g. `//` is treated as though it were `/`) +/// * Relative path segments, i.e. 
`.` and `..` are percent encoded +/// * Unsafe characters are percent encoded, as described by [RFC 1738] +/// * All paths are relative to the root of the object store +/// +/// In order to provide these guarantees there are two ways to safely construct a [`Path`] +/// +/// # Encode +/// +/// A string containing potentially illegal path segments can be encoded to a [`Path`] +/// using [`Path::from`] or [`Path::from_iter`]. +/// +/// ``` +/// # use object_store::path::Path; +/// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar"); +/// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar"); +/// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar"); +/// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar"); +/// ``` +/// +/// Note: if provided with an already percent encoded string, this will encode it again +/// +/// ``` +/// # use object_store::path::Path; +/// assert_eq!(Path::from("foo/foo%2Fbar").as_ref(), "foo/foo%252Fbar"); +/// ``` +/// +/// # Parse +/// +/// Alternatively a [`Path`] can be created from an existing string, returning an +/// error if it is invalid. Unlike the encoding methods, this will permit +/// valid percent encoded sequences. +/// +/// ``` +/// # use object_store::path::Path; +/// +/// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar"); +/// Path::parse("..").unwrap_err(); +/// Path::parse("/foo//").unwrap_err(); +/// Path::parse("😀").unwrap_err(); +/// Path::parse("%Q").unwrap_err(); +/// ``` +/// +/// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub struct Path { + /// The raw path with no leading or trailing delimiters + raw: String, +} + +impl Path { + /// Parse a string as a [`Path`], returning a [`Error`] if invalid, + /// as defined on the docstring for [`Path`] + /// + /// Note: this will strip any leading `/` or trailing `/` + pub fn parse(path: impl AsRef) -> Result { + let path = path.as_ref(); + + let stripped = path.strip_prefix(DELIMITER).unwrap_or(path); + if stripped.is_empty() { + return Ok(Default::default()); + } + + let stripped = stripped.strip_suffix(DELIMITER).unwrap_or(stripped); + + for segment in stripped.split(DELIMITER) { + ensure!(!segment.is_empty(), EmptySegmentSnafu { path }); + PathPart::parse(segment).context(BadSegmentSnafu { path })?; + } + + Ok(Self { + raw: stripped.to_string(), + }) + } + + /// Convert a filesystem path to a [`Path`] relative to the filesystem root + /// + /// This will return an error if the path does not exist, or contains illegal + /// character sequences as defined by [`Path::parse`] + pub fn from_filesystem_path( + path: impl AsRef, + ) -> Result { + Self::from_filesystem_path_with_base(path, None) + } + + /// Convert a filesystem path to a [`Path`] relative to the provided base + /// + /// This will return an error if the path does not exist on the local filesystem, + /// contains illegal character sequences as defined by [`Path::parse`], or `base` + /// does not refer to a parent path of `path` + pub(crate) fn from_filesystem_path_with_base( + path: impl AsRef, + base: Option<&Url>, + ) -> Result { + let url = filesystem_path_to_url(path)?; + let path = match base { + Some(prefix) => url.path().strip_prefix(prefix.path()).ok_or_else(|| { + Error::PrefixMismatch { + path: url.path().to_string(), + prefix: prefix.to_string(), + } + })?, + None => url.path(), + }; + + // Reverse any percent encoding performed by conversion to URL + let decoded = 
percent_decode(path.as_bytes()) + .decode_utf8() + .context(NonUnicodeSnafu { path })?; + + Self::parse(decoded) + } + + /// Returns the [`PathPart`] of this [`Path`] + pub fn parts(&self) -> impl Iterator> { + match self.raw.is_empty() { + true => itertools::Either::Left(std::iter::empty()), + false => itertools::Either::Right( + self.raw + .split(DELIMITER) + .map(|s| PathPart { raw: s.into() }), + ), + } + } + + /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix` + /// + /// Returns `None` if the prefix does not match + pub fn prefix_match( + &self, + prefix: &Self, + ) -> Option> + '_> { + let diff = itertools::diff_with(self.parts(), prefix.parts(), |a, b| a == b); + + match diff { + // Both were equal + None => Some(itertools::Either::Left(std::iter::empty())), + // Mismatch or prefix was longer => None + Some( + itertools::Diff::FirstMismatch(_, _, _) | itertools::Diff::Longer(_, _), + ) => None, + // Match with remaining + Some(itertools::Diff::Shorter(_, back)) => { + Some(itertools::Either::Right(back)) + } + } + } + + /// Returns true if this [`Path`] starts with `prefix` + pub fn prefix_matches(&self, prefix: &Self) -> bool { + self.prefix_match(prefix).is_some() + } + + /// Creates a new child of this [`Path`] + pub fn child<'a>(&self, child: impl Into>) -> Self { + let raw = match self.raw.is_empty() { + true => format!("{}", child.into().raw), + false => format!("{}{}{}", self.raw, DELIMITER, child.into().raw), + }; + + Self { raw } + } +} + +impl AsRef for Path { + fn as_ref(&self) -> &str { + &self.raw + } +} + +impl From<&str> for Path { + fn from(path: &str) -> Self { + Self::from_iter(path.split(DELIMITER)) + } +} + +impl From for Path { + fn from(path: String) -> Self { + Self::from_iter(path.split(DELIMITER)) + } +} + +impl From for String { + fn from(path: Path) -> Self { + path.raw + } +} + +impl std::fmt::Display for Path { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.raw.fmt(f) + } +} + +impl<'a, I> FromIterator for Path +where + I: Into>, +{ + fn from_iter>(iter: T) -> Self { + let raw = T::into_iter(iter) + .map(|s| s.into()) + .filter(|s| !s.raw.is_empty()) + .map(|s| s.raw) + .join(DELIMITER); + + Self { raw } + } +} + +/// Given a filesystem path, convert it to its canonical URL representation, +/// returning an error if the file doesn't exist on the local filesystem +pub(crate) fn filesystem_path_to_url( + path: impl AsRef, +) -> Result { + let path = path.as_ref().canonicalize().context(CanonicalizeSnafu { + path: path.as_ref(), + })?; + + match path.is_dir() { + true => Url::from_directory_path(&path), + false => Url::from_file_path(&path), + } + .map_err(|_| Error::InvalidPath { path }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cloud_prefix_with_trailing_delimiter() { + // Use case: files exist in object storage named `foo/bar.json` and + // `foo_test.json`. A search for the prefix `foo/` should return + // `foo/bar.json` but not `foo_test.json'. + let prefix = Path::from_iter(["test"]); + assert_eq!(prefix.as_ref(), "test"); + } + + #[test] + fn push_encodes() { + let location = Path::from_iter(["foo/bar", "baz%2Ftest"]); + assert_eq!(location.as_ref(), "foo%2Fbar/baz%252Ftest"); + } + + #[test] + fn test_parse() { + assert_eq!(Path::parse("/").unwrap().as_ref(), ""); + assert_eq!(Path::parse("").unwrap().as_ref(), ""); + + let err = Path::parse("//").unwrap_err(); + assert!(matches!(err, Error::EmptySegment { .. 
})); + + assert_eq!(Path::parse("/foo/bar/").unwrap().as_ref(), "foo/bar"); + assert_eq!(Path::parse("foo/bar/").unwrap().as_ref(), "foo/bar"); + assert_eq!(Path::parse("foo/bar").unwrap().as_ref(), "foo/bar"); + + let err = Path::parse("foo///bar").unwrap_err(); + assert!(matches!(err, Error::EmptySegment { .. })); + } + + #[test] + fn convert_raw_before_partial_eq() { + // dir and file_name + let cloud = Path::from("test_dir/test_file.json"); + let built = Path::from_iter(["test_dir", "test_file.json"]); + + assert_eq!(built, cloud); + + // dir and file_name w/o dot + let cloud = Path::from("test_dir/test_file"); + let built = Path::from_iter(["test_dir", "test_file"]); + + assert_eq!(built, cloud); + + // dir, no file + let cloud = Path::from("test_dir/"); + let built = Path::from_iter(["test_dir"]); + assert_eq!(built, cloud); + + // file_name, no dir + let cloud = Path::from("test_file.json"); + let built = Path::from_iter(["test_file.json"]); + assert_eq!(built, cloud); + + // empty + let cloud = Path::from(""); + let built = Path::from_iter(["", ""]); + + assert_eq!(built, cloud); + } + + #[test] + fn parts_after_prefix_behavior() { + let existing_path = Path::from("apple/bear/cow/dog/egg.json"); + + // Prefix with one directory + let prefix = Path::from("apple"); + let expected_parts: Vec> = vec!["bear", "cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts: Vec<_> = existing_path.prefix_match(&prefix).unwrap().collect(); + assert_eq!(parts, expected_parts); + + // Prefix with two directories + let prefix = Path::from("apple/bear"); + let expected_parts: Vec> = vec!["cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts: Vec<_> = existing_path.prefix_match(&prefix).unwrap().collect(); + assert_eq!(parts, expected_parts); + + // Not a prefix + let prefix = Path::from("cow"); + assert!(existing_path.prefix_match(&prefix).is_none()); + + // Prefix with a partial directory + let prefix = Path::from("ap"); + assert!(existing_path.prefix_match(&prefix).is_none()); + + // Prefix matches but there aren't any parts after it + let existing_path = Path::from("apple/bear/cow/dog"); + + let prefix = existing_path.clone(); + assert_eq!(existing_path.prefix_match(&prefix).unwrap().count(), 0); + } + + #[test] + fn prefix_matches() { + let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something"]); + let needle = haystack.clone(); + // self starts with self + assert!( + haystack.prefix_matches(&haystack), + "{:?} should have started with {:?}", + haystack, + haystack + ); + + // a longer prefix doesn't match + let needle = needle.child("longer now"); + assert!( + !haystack.prefix_matches(&needle), + "{:?} shouldn't have started with {:?}", + haystack, + needle + ); + + // one dir prefix matches + let needle = Path::from_iter(["foo/bar"]); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // two dir prefix matches + let needle = needle.child("baz%2Ftest"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // partial dir prefix doesn't match + let needle = Path::from_iter(["f"]); + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // one dir and one partial dir doesn't match + let needle = Path::from_iter(["foo/bar", "baz"]); + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + 
needle + ); + + // empty prefix matches + let needle = Path::from(""); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + } + + #[test] + fn prefix_matches_with_file_name() { + let haystack = + Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]); + + // All directories match and file name is a prefix + let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo"]); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // All directories match but file name is not a prefix + let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "e"]); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is a prefix of the next directory; this + // does not match + let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "s"]); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is NOT a prefix of the next directory; + // no match + let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "p"]); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + } +} diff --git a/src/path/parts.rs b/src/path/parts.rs new file mode 100644 index 0000000..e73b184 --- /dev/null +++ b/src/path/parts.rs @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use percent_encoding::{percent_decode, percent_encode, AsciiSet, CONTROLS}; +use std::borrow::Cow; + +use crate::path::DELIMITER_BYTE; +use snafu::Snafu; + +/// Error returned by [`PathPart::parse`] +#[derive(Debug, Snafu)] +#[snafu(display("Invalid path segment - got \"{}\" expected: \"{}\"", actual, expected))] +#[allow(missing_copy_implementations)] +pub struct InvalidPart { + actual: String, + expected: String, +} + +/// The PathPart type exists to validate the directory/file names that form part +/// of a path. +/// +/// A PathPart instance is guaranteed to to contain no illegal characters (e.g. `/`) +/// as it can only be constructed by going through the `from` impl. 
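+///
+/// For example, the path delimiter is percent encoded when it appears inside
+/// a segment (a small sketch mirroring the tests below):
+///
+/// ```
+/// # use object_store::path::PathPart;
+/// let part: PathPart<'_> = "foo/bar".into();
+/// assert_eq!(part.as_ref(), "foo%2Fbar");
+/// ```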
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] +pub struct PathPart<'a> { + pub(super) raw: Cow<'a, str>, +} + +impl<'a> PathPart<'a> { + /// Parse the provided path segment as a [`PathPart`] returning an error if invalid + pub fn parse(segment: &'a str) -> Result { + let decoded: Cow<'a, [u8]> = percent_decode(segment.as_bytes()).into(); + let part = PathPart::from(decoded.as_ref()); + if segment != part.as_ref() { + return Err(InvalidPart { + actual: segment.to_string(), + expected: part.raw.to_string(), + }); + } + + Ok(Self { + raw: segment.into(), + }) + } +} + +/// Characters we want to encode. +const INVALID: &AsciiSet = &CONTROLS + // The delimiter we are reserving for internal hierarchy + .add(DELIMITER_BYTE) + // Characters AWS recommends avoiding for object keys + // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html + .add(b'\\') + .add(b'{') + .add(b'^') + .add(b'}') + .add(b'%') + .add(b'`') + .add(b']') + .add(b'"') // " <-- my editor is confused about double quotes within single quotes + .add(b'>') + .add(b'[') + .add(b'~') + .add(b'<') + .add(b'#') + .add(b'|') + // Characters Google Cloud Storage recommends avoiding for object names + // https://cloud.google.com/storage/docs/naming-objects + .add(b'\r') + .add(b'\n') + .add(b'*') + .add(b'?'); + +impl<'a> From<&'a [u8]> for PathPart<'a> { + fn from(v: &'a [u8]) -> Self { + let inner = match v { + // We don't want to encode `.` generally, but we do want to disallow parts of paths + // to be equal to `.` or `..` to prevent file system traversal shenanigans. + b"." => "%2E".into(), + b".." => "%2E%2E".into(), + other => percent_encode(other, INVALID).into(), + }; + Self { raw: inner } + } +} + +impl<'a> From<&'a str> for PathPart<'a> { + fn from(v: &'a str) -> Self { + Self::from(v.as_bytes()) + } +} + +impl From for PathPart<'static> { + fn from(s: String) -> Self { + Self { + raw: Cow::Owned(PathPart::from(s.as_str()).raw.into_owned()), + } + } +} + +impl<'a> AsRef for PathPart<'a> { + fn as_ref(&self) -> &str { + self.raw.as_ref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn path_part_delimiter_gets_encoded() { + let part: PathPart<'_> = "foo/bar".into(); + assert_eq!(part.raw, "foo%2Fbar"); + } + + #[test] + fn path_part_given_already_encoded_string() { + let part: PathPart<'_> = "foo%2Fbar".into(); + assert_eq!(part.raw, "foo%252Fbar"); + } + + #[test] + fn path_part_cant_be_one_dot() { + let part: PathPart<'_> = ".".into(); + assert_eq!(part.raw, "%2E"); + } + + #[test] + fn path_part_cant_be_two_dots() { + let part: PathPart<'_> = "..".into(); + assert_eq!(part.raw, "%2E%2E"); + } +} diff --git a/src/throttle.rs b/src/throttle.rs new file mode 100644 index 0000000..7a54a06 --- /dev/null +++ b/src/throttle.rs @@ -0,0 +1,540 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A throttling object store wrapper +use parking_lot::Mutex; +use std::ops::Range; +use std::{convert::TryInto, sync::Arc}; + +use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use async_trait::async_trait; +use bytes::Bytes; +use futures::{stream::BoxStream, StreamExt}; +use std::time::Duration; + +/// Configuration settings for throttled store +#[derive(Debug, Default, Clone, Copy)] +pub struct ThrottleConfig { + /// Sleep duration for every call to [`delete`](ThrottledStore::delete). + /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. + pub wait_delete_per_call: Duration, + + /// Sleep duration for every byte received during [`get`](ThrottledStore::get). + /// + /// Sleeping is performed after the underlying store returned and only for successful gets. The + /// sleep duration is additive to [`wait_get_per_call`](Self::wait_get_per_call). + /// + /// Note that the per-byte sleep only happens as the user consumes the output bytes. Should + /// there be an intermediate failure (i.e. after partly consuming the output bytes), the + /// resulting sleep time will be partial as well. + pub wait_get_per_byte: Duration, + + /// Sleep duration for every call to [`get`](ThrottledStore::get). + /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. The sleep duration is additive to + /// [`wait_get_per_byte`](Self::wait_get_per_byte). + pub wait_get_per_call: Duration, + + /// Sleep duration for every call to [`list`](ThrottledStore::list). + /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. The sleep duration is additive to + /// [`wait_list_per_entry`](Self::wait_list_per_entry). + pub wait_list_per_call: Duration, + + /// Sleep duration for every entry received during [`list`](ThrottledStore::list). + /// + /// Sleeping is performed after the underlying store returned and only for successful lists. + /// The sleep duration is additive to [`wait_list_per_call`](Self::wait_list_per_call). + /// + /// Note that the per-entry sleep only happens as the user consumes the output entries. Should + /// there be an intermediate failure (i.e. after partly consuming the output entries), the + /// resulting sleep time will be partial as well. + pub wait_list_per_entry: Duration, + + /// Sleep duration for every call to + /// [`list_with_delimiter`](ThrottledStore::list_with_delimiter). + /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. The sleep duration is additive to + /// [`wait_list_with_delimiter_per_entry`](Self::wait_list_with_delimiter_per_entry). + pub wait_list_with_delimiter_per_call: Duration, + + /// Sleep duration for every entry received during + /// [`list_with_delimiter`](ThrottledStore::list_with_delimiter). + /// + /// Sleeping is performed after the underlying store returned and only for successful gets. The + /// sleep duration is additive to + /// [`wait_list_with_delimiter_per_call`](Self::wait_list_with_delimiter_per_call). + pub wait_list_with_delimiter_per_entry: Duration, + + /// Sleep duration for every call to [`put`](ThrottledStore::put). 
+ /// + /// Sleeping is done before the underlying store is called and independently of the success of + /// the operation. + pub wait_put_per_call: Duration, +} + +/// Sleep only if non-zero duration +async fn sleep(duration: Duration) { + if !duration.is_zero() { + tokio::time::sleep(duration).await + } +} + +/// Store wrapper that wraps an inner store with some `sleep` calls. +/// +/// This can be used for performance testing. +/// +/// **Note that the behavior of the wrapper is deterministic and might not reflect real-world +/// conditions!** +#[derive(Debug)] +pub struct ThrottledStore { + inner: T, + config: Arc>, +} + +impl ThrottledStore { + /// Create new wrapper with zero waiting times. + pub fn new(inner: T, config: ThrottleConfig) -> Self { + Self { + inner, + config: Arc::new(Mutex::new(config)), + } + } + + /// Mutate config. + pub fn config_mut(&self, f: F) + where + F: Fn(&mut ThrottleConfig), + { + let mut guard = self.config.lock(); + f(&mut guard) + } + + /// Return copy of current config. + pub fn config(&self) -> ThrottleConfig { + *self.config.lock() + } +} + +impl std::fmt::Display for ThrottledStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "ThrottledStore({})", self.inner) + } +} + +#[async_trait] +impl ObjectStore for ThrottledStore { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.put(location, bytes).await + } + + async fn get(&self, location: &Path) -> Result { + sleep(self.config().wait_get_per_call).await; + + // need to copy to avoid moving / referencing `self` + let wait_get_per_byte = self.config().wait_get_per_byte; + + self.inner.get(location).await.map(|result| { + let s = match result { + GetResult::Stream(s) => s, + GetResult::File(_, _) => unimplemented!(), + }; + + GetResult::Stream( + s.then(move |bytes_result| async move { + match bytes_result { + Ok(bytes) => { + let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); + sleep(wait_get_per_byte * bytes_len).await; + Ok(bytes) + } + Err(err) => Err(err), + } + }) + .boxed(), + ) + }) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let config = self.config(); + + let sleep_duration = config.wait_delete_per_call + + config.wait_get_per_byte * (range.end - range.start) as u32; + + sleep(sleep_duration).await; + + self.inner.get_range(location, range).await + } + + async fn head(&self, location: &Path) -> Result { + sleep(self.config().wait_put_per_call).await; + self.inner.head(location).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + sleep(self.config().wait_delete_per_call).await; + + self.inner.delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + sleep(self.config().wait_list_per_call).await; + + // need to copy to avoid moving / referencing `self` + let wait_list_per_entry = self.config().wait_list_per_entry; + + self.inner.list(prefix).await.map(|stream| { + stream + .then(move |result| async move { + match result { + Ok(entry) => { + sleep(wait_list_per_entry).await; + Ok(entry) + } + Err(err) => Err(err), + } + }) + .boxed() + }) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + sleep(self.config().wait_list_with_delimiter_per_call).await; + + match self.inner.list_with_delimiter(prefix).await { + Ok(list_result) => { + let entries_len = usize_to_u32_saturate(list_result.objects.len()); + 
sleep(self.config().wait_list_with_delimiter_per_entry * entries_len) + .await; + Ok(list_result) + } + Err(err) => Err(err), + } + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.copy(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.copy_if_not_exists(from, to).await + } +} + +/// Saturated `usize` to `u32` cast. +fn usize_to_u32_saturate(x: usize) -> u32 { + x.try_into().unwrap_or(u32::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + memory::InMemory, + tests::{ + copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, + }, + }; + use bytes::Bytes; + use futures::TryStreamExt; + use tokio::time::Duration; + use tokio::time::Instant; + + const WAIT_TIME: Duration = Duration::from_millis(100); + const ZERO: Duration = Duration::from_millis(0); // Duration::default isn't constant + + macro_rules! assert_bounds { + ($d:expr, $lower:expr) => { + assert_bounds!($d, $lower, $lower + 1); + }; + ($d:expr, $lower:expr, $upper:expr) => { + let d = $d; + let lower = $lower * WAIT_TIME; + let upper = $upper * WAIT_TIME; + assert!(d >= lower, "{:?} must be >= than {:?}", d, lower); + assert!(d < upper, "{:?} must be < than {:?}", d, upper); + }; + } + + #[tokio::test] + async fn throttle_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + put_get_delete_list(&store).await.unwrap(); + list_uses_directories_correctly(&store).await.unwrap(); + list_with_delimiter(&store).await.unwrap(); + rename_and_copy(&store).await.unwrap(); + copy_if_not_exists(&store).await.unwrap(); + } + + #[tokio::test] + async fn delete_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_delete(&store, None).await, 0); + assert_bounds!(measure_delete(&store, Some(0)).await, 0); + assert_bounds!(measure_delete(&store, Some(10)).await, 0); + + store.config_mut(|cfg| cfg.wait_delete_per_call = WAIT_TIME); + assert_bounds!(measure_delete(&store, None).await, 1); + assert_bounds!(measure_delete(&store, Some(0)).await, 1); + assert_bounds!(measure_delete(&store, Some(10)).await, 1); + } + + #[tokio::test] + // macos github runner is so slow it can't complete within WAIT_TIME*2 + #[cfg(target_os = "linux")] + async fn get_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_get(&store, None).await, 0); + assert_bounds!(measure_get(&store, Some(0)).await, 0); + assert_bounds!(measure_get(&store, Some(10)).await, 0); + + store.config_mut(|cfg| cfg.wait_get_per_call = WAIT_TIME); + assert_bounds!(measure_get(&store, None).await, 1); + assert_bounds!(measure_get(&store, Some(0)).await, 1); + assert_bounds!(measure_get(&store, Some(10)).await, 1); + + store.config_mut(|cfg| { + cfg.wait_get_per_call = ZERO; + cfg.wait_get_per_byte = WAIT_TIME; + }); + assert_bounds!(measure_get(&store, Some(2)).await, 2); + + store.config_mut(|cfg| { + cfg.wait_get_per_call = WAIT_TIME; + cfg.wait_get_per_byte = WAIT_TIME; + }); + assert_bounds!(measure_get(&store, Some(2)).await, 3); + } + + #[tokio::test] + // macos github runner is so slow it can't complete within WAIT_TIME*2 + #[cfg(target_os = "linux")] + async fn list_test() { + let inner = InMemory::new(); + 
let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_list(&store, 0).await, 0); + assert_bounds!(measure_list(&store, 10).await, 0); + + store.config_mut(|cfg| cfg.wait_list_per_call = WAIT_TIME); + assert_bounds!(measure_list(&store, 0).await, 1); + assert_bounds!(measure_list(&store, 10).await, 1); + + store.config_mut(|cfg| { + cfg.wait_list_per_call = ZERO; + cfg.wait_list_per_entry = WAIT_TIME; + }); + assert_bounds!(measure_list(&store, 2).await, 2); + + store.config_mut(|cfg| { + cfg.wait_list_per_call = WAIT_TIME; + cfg.wait_list_per_entry = WAIT_TIME; + }); + assert_bounds!(measure_list(&store, 2).await, 3); + } + + #[tokio::test] + // macos github runner is so slow it can't complete within WAIT_TIME*2 + #[cfg(target_os = "linux")] + async fn list_with_delimiter_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_list_with_delimiter(&store, 0).await, 0); + assert_bounds!(measure_list_with_delimiter(&store, 10).await, 0); + + store.config_mut(|cfg| cfg.wait_list_with_delimiter_per_call = WAIT_TIME); + assert_bounds!(measure_list_with_delimiter(&store, 0).await, 1); + assert_bounds!(measure_list_with_delimiter(&store, 10).await, 1); + + store.config_mut(|cfg| { + cfg.wait_list_with_delimiter_per_call = ZERO; + cfg.wait_list_with_delimiter_per_entry = WAIT_TIME; + }); + assert_bounds!(measure_list_with_delimiter(&store, 2).await, 2); + + store.config_mut(|cfg| { + cfg.wait_list_with_delimiter_per_call = WAIT_TIME; + cfg.wait_list_with_delimiter_per_entry = WAIT_TIME; + }); + assert_bounds!(measure_list_with_delimiter(&store, 2).await, 3); + } + + #[tokio::test] + async fn put_test() { + let inner = InMemory::new(); + let store = ThrottledStore::new(inner, ThrottleConfig::default()); + + assert_bounds!(measure_put(&store, 0).await, 0); + assert_bounds!(measure_put(&store, 10).await, 0); + + store.config_mut(|cfg| cfg.wait_put_per_call = WAIT_TIME); + assert_bounds!(measure_put(&store, 0).await, 1); + assert_bounds!(measure_put(&store, 10).await, 1); + + store.config_mut(|cfg| cfg.wait_put_per_call = ZERO); + assert_bounds!(measure_put(&store, 0).await, 0); + } + + async fn place_test_object( + store: &ThrottledStore, + n_bytes: Option, + ) -> Path { + let path = Path::from("foo"); + + if let Some(n_bytes) = n_bytes { + let data: Vec<_> = std::iter::repeat(1u8).take(n_bytes).collect(); + let bytes = Bytes::from(data); + store.put(&path, bytes).await.unwrap(); + } else { + // ensure object is absent + store.delete(&path).await.unwrap(); + } + + path + } + + async fn place_test_objects( + store: &ThrottledStore, + n_entries: usize, + ) -> Path { + let prefix = Path::from("foo"); + + // clean up store + let entries: Vec<_> = store + .list(Some(&prefix)) + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + for entry in entries { + store.delete(&entry.location).await.unwrap(); + } + + // create new entries + for i in 0..n_entries { + let path = prefix.child(i.to_string().as_str()); + + let data = Bytes::from("bar"); + store.put(&path, data).await.unwrap(); + } + + prefix + } + + async fn measure_delete( + store: &ThrottledStore, + n_bytes: Option, + ) -> Duration { + let path = place_test_object(store, n_bytes).await; + + let t0 = Instant::now(); + store.delete(&path).await.unwrap(); + + t0.elapsed() + } + + async fn measure_get( + store: &ThrottledStore, + n_bytes: Option, + ) -> Duration { + let path = place_test_object(store, n_bytes).await; + + let t0 = 
Instant::now(); + let res = store.get(&path).await; + if n_bytes.is_some() { + // need to consume bytes to provoke sleep times + let s = match res.unwrap() { + GetResult::Stream(s) => s, + GetResult::File(_, _) => unimplemented!(), + }; + + s.map_ok(|b| bytes::BytesMut::from(&b[..])) + .try_concat() + .await + .unwrap(); + } else { + assert!(res.is_err()); + } + + t0.elapsed() + } + + async fn measure_list( + store: &ThrottledStore, + n_entries: usize, + ) -> Duration { + let prefix = place_test_objects(store, n_entries).await; + + let t0 = Instant::now(); + store + .list(Some(&prefix)) + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + t0.elapsed() + } + + async fn measure_list_with_delimiter( + store: &ThrottledStore, + n_entries: usize, + ) -> Duration { + let prefix = place_test_objects(store, n_entries).await; + + let t0 = Instant::now(); + store.list_with_delimiter(Some(&prefix)).await.unwrap(); + + t0.elapsed() + } + + async fn measure_put(store: &ThrottledStore, n_bytes: usize) -> Duration { + let data: Vec<_> = std::iter::repeat(1u8).take(n_bytes).collect(); + let bytes = Bytes::from(data); + + let t0 = Instant::now(); + store.put(&Path::from("foo"), bytes).await.unwrap(); + + t0.elapsed() + } +} diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..a56a294 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::future::Future; +use std::time::Instant; +use tokio::sync::Mutex; + +/// A temporary authentication token with an associated expiry +#[derive(Debug, Clone)] +pub struct TemporaryToken { + /// The temporary credential + pub token: T, + /// The instant at which this credential is no longer valid + pub expiry: Instant, +} + +/// Provides [`TokenCache::get_or_insert_with`] which can be used to cache a +/// [`TemporaryToken`] based on its expiry +#[derive(Debug, Default)] +pub struct TokenCache { + cache: Mutex>>, +} + +impl TokenCache { + pub async fn get_or_insert_with(&self, f: F) -> Result + where + F: FnOnce() -> Fut + Send, + Fut: Future, E>> + Send, + { + let now = Instant::now(); + let mut locked = self.cache.lock().await; + + if let Some(cached) = locked.as_ref() { + let delta = cached + .expiry + .checked_duration_since(now) + .unwrap_or_default(); + + if delta.as_secs() > 300 { + return Ok(cached.token.clone()); + } + } + + let cached = f().await?; + let token = cached.token.clone(); + *locked = Some(cached); + + Ok(token) + } +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..4f3ed86 --- /dev/null +++ b/src/util.rs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Common logic for interacting with remote object stores +use super::Result; +use bytes::Bytes; +use futures::{stream::StreamExt, Stream}; + +/// Returns the prefix to be passed to an object store +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { + prefix + .filter(|x| !x.as_ref().is_empty()) + .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)) +} + +/// Returns a formatted HTTP range header as per +/// +#[cfg(any(feature = "aws", feature = "gcp"))] +pub fn format_http_range(range: std::ops::Range) -> String { + format!("bytes={}-{}", range.start, range.end.saturating_sub(1)) +} + +/// Collect a stream into [`Bytes`] avoiding copying in the event of a single chunk +pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result +where + S: Stream> + Send + Unpin, +{ + let first = stream.next().await.transpose()?.unwrap_or_default(); + + // Avoid copying if single response + match stream.next().await.transpose()? { + None => Ok(first), + Some(second) => { + let size_hint = size_hint.unwrap_or_else(|| first.len() + second.len()); + + let mut buf = Vec::with_capacity(size_hint); + buf.extend_from_slice(&first); + buf.extend_from_slice(&second); + while let Some(maybe_bytes) = stream.next().await { + buf.extend_from_slice(&maybe_bytes?); + } + + Ok(buf.into()) + } + } +} + +/// Takes a function and spawns it to a tokio blocking pool if available +pub async fn maybe_spawn_blocking(f: F) -> Result +where + F: FnOnce() -> Result + Send + 'static, + T: Send + 'static, +{ + match tokio::runtime::Handle::try_current() { + Ok(runtime) => runtime.spawn_blocking(f).await?, + Err(_) => f(), + } +} From 51df1947b63d4c9c4102092d0fc8aa602e511078 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 23 Jul 2022 06:01:54 -0700 Subject: [PATCH 002/397] Increase upper wait time (#2142) --- src/throttle.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/throttle.rs b/src/throttle.rs index 7a54a06..e91d55a 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -275,7 +275,7 @@ mod tests { macro_rules! 
assert_bounds { ($d:expr, $lower:expr) => { - assert_bounds!($d, $lower, $lower + 1); + assert_bounds!($d, $lower, $lower + 2); }; ($d:expr, $lower:expr, $upper:expr) => { let d = $d; From d0b03d3d44966dd517b1a30994488f5ee95cb180 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Sun, 24 Jul 2022 03:04:10 +0800 Subject: [PATCH 003/397] update rust version to 1.62 (#2144) --- src/throttle.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/throttle.rs b/src/throttle.rs index e91d55a..6560296 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -430,6 +430,7 @@ mod tests { path } + #[allow(dead_code)] async fn place_test_objects( store: &ThrottledStore, n_entries: usize, @@ -472,6 +473,7 @@ mod tests { t0.elapsed() } + #[allow(dead_code)] async fn measure_get( store: &ThrottledStore, n_bytes: Option, @@ -498,6 +500,7 @@ mod tests { t0.elapsed() } + #[allow(dead_code)] async fn measure_list( store: &ThrottledStore, n_entries: usize, @@ -516,6 +519,7 @@ mod tests { t0.elapsed() } + #[allow(dead_code)] async fn measure_list_with_delimiter( store: &ThrottledStore, n_entries: usize, From 3739a6cf0ef7d5719e523880bb8be43af482572b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 25 Jul 2022 09:24:01 -0400 Subject: [PATCH 004/397] Port `object_store` integration tests, use github actions (#2148) * Add github test skeleton * Cleanups and fmt * Run on changes to object_store * Update name * Broken yaml? * Remove uneeded lint job * Run only object store tests * Add local gcp test instructions * Allow custom http client for gcs * remove unused error * Also run clippy * Update object_store/src/gcp.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * rename more * Fixup test Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- CONTRIBUTING.md | 23 +++++++++++++++++++++-- src/azure.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++++++- src/gcp.rs | 34 ++++++++++++++++++++++++++------- src/lib.rs | 4 ++-- 4 files changed, 99 insertions(+), 12 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2e216dd..7c2832c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,5 +90,24 @@ $ cargo test --features azure ### GCP -We don't have a good story yet for testing the GCP integration locally. You will need to create a GCS bucket, a -service account that has access to it, and use this to run the tests. 
+To test the GCS integration, we use [Fake GCS Server](https://github.com/fsouza/fake-gcs-server) + +Startup the fake server: + +```shell +docker run -p 4443:4443 fsouza/fake-gcs-server +``` + +Configure the account: +```shell +curl --insecure -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "https://localhost:4443/storage/v1/b" +echo '{"gcs_base_url": "https://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > /tmp/gcs.json +``` + +Now run the tests: +```shell +TEST_INTEGRATION=1 \ +OBJECT_STORE_BUCKET=test-bucket \ +GOOGLE_SERVICE_ACCOUNT=/tmp/gcs.json \ +cargo test -p object_store --features=gcp +``` diff --git a/src/azure.rs b/src/azure.rs index 5f43279..75dafef 100644 --- a/src/azure.rs +++ b/src/azure.rs @@ -38,6 +38,7 @@ use futures::{ use snafu::{ResultExt, Snafu}; use std::collections::BTreeSet; use std::{convert::TryInto, sync::Arc}; +use url::Url; /// A specialized `Error` for Azure object store-related errors #[derive(Debug, Snafu)] @@ -158,6 +159,18 @@ enum Error { "Azurite (azure emulator) support not compiled in, please add `azure_test` feature" ))] NoEmulatorFeature, + + #[snafu(display( + "Unable parse emulator url {}={}, Error: {}", + env_name, + env_value, + source + ))] + UnableToParseEmulatorUrl { + env_name: String, + env_value: String, + source: url::ParseError, + }, } impl From for super::Error { @@ -507,6 +520,21 @@ fn check_if_emulator_works() -> Result<()> { Err(Error::NoEmulatorFeature.into()) } +/// Parses the contents of the environment variable `env_name` as a URL +/// if present, otherwise falls back to default_url +fn url_from_env(env_name: &str, default_url: &str) -> Result { + let url = match std::env::var(env_name) { + Ok(env_value) => { + Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { + env_name, + env_value, + })? + } + Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), + }; + Ok(url) +} + /// Configure a connection to container with given name on Microsoft Azure /// Blob store. /// @@ -524,7 +552,27 @@ pub fn new_azure( let (is_emulator, storage_account_client) = if use_emulator { check_if_emulator_works()?; - (true, StorageAccountClient::new_emulator_default()) + // Allow overriding defaults. Values taken from + // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 + let http_client = azure_core::new_http_client(); + let blob_storage_url = + url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; + let queue_storage_url = + url_from_env("AZURITE_QUEUE_STORAGE_URL", "http://127.0.0.1:10001")?; + let table_storage_url = + url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10002")?; + let filesystem_url = + url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10004")?; + + let storage_client = StorageAccountClient::new_emulator( + http_client, + &blob_storage_url, + &table_storage_url, + &queue_storage_url, + &filesystem_url, + ); + + (true, storage_client) } else { ( false, diff --git a/src/gcp.rs b/src/gcp.rs index 84fb572..e836cab 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -502,9 +502,17 @@ fn reader_credentials_file( pub fn new_gcs( service_account_path: impl AsRef, bucket_name: impl Into, +) -> Result { + new_gcs_with_client(service_account_path, bucket_name, Client::new()) +} + +/// Configure a connection to Google Cloud Storage with the specified HTTP client. 
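+/// Accepting a caller-supplied client is primarily a test hook: the integration
+/// tests in this patch build a `reqwest` client that tolerates the self-signed
+/// certificate served by a local Fake GCS Server instance.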
+pub fn new_gcs_with_client( + service_account_path: impl AsRef, + bucket_name: impl Into, + client: Client, ) -> Result { let credentials = reader_credentials_file(service_account_path)?; - let client = Client::new(); // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes let scope = "https://www.googleapis.com/auth/devstorage.full_control"; @@ -575,6 +583,18 @@ mod test { service_account: String, } + impl GoogleCloudConfig { + fn build_test(self) -> Result { + // ignore HTTPS errors in tests so we can use fake-gcs server + let client = Client::builder() + .danger_accept_invalid_certs(true) + .build() + .expect("Error creating http client for testing"); + + new_gcs_with_client(self.service_account, self.bucket, client) + } + } + // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set. macro_rules! maybe_skip_integration { () => {{ @@ -622,7 +642,7 @@ mod test { #[tokio::test] async fn gcs_test() { let config = maybe_skip_integration!(); - let integration = new_gcs(config.service_account, config.bucket).unwrap(); + let integration = config.build_test().unwrap(); put_get_delete_list(&integration).await.unwrap(); list_uses_directories_correctly(&integration).await.unwrap(); @@ -633,7 +653,7 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_location() { let config = maybe_skip_integration!(); - let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + let integration = config.build_test().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -650,7 +670,7 @@ mod test { async fn gcs_test_get_nonexistent_bucket() { let mut config = maybe_skip_integration!(); config.bucket = NON_EXISTENT_NAME.into(); - let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + let integration = config.build_test().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -668,7 +688,7 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_location() { let config = maybe_skip_integration!(); - let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + let integration = config.build_test().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -684,7 +704,7 @@ mod test { async fn gcs_test_delete_nonexistent_bucket() { let mut config = maybe_skip_integration!(); config.bucket = NON_EXISTENT_NAME.into(); - let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + let integration = config.build_test().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -700,7 +720,7 @@ mod test { async fn gcs_test_put_nonexistent_bucket() { let mut config = maybe_skip_integration!(); config.bucket = NON_EXISTENT_NAME.into(); - let integration = new_gcs(config.service_account, &config.bucket).unwrap(); + let integration = config.build_test().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); let data = Bytes::from("arbitrary data"); diff --git a/src/lib.rs b/src/lib.rs index 4a56b03..2dc6506 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -695,8 +695,8 @@ mod tests { #[tokio::test] async fn test_list_lifetimes() { let store = memory::InMemory::new(); - let stream = list_store(&store, "path").await.unwrap(); - assert_eq!(stream.count().await, 0); + let mut stream = list_store(&store, "path").await.unwrap(); + assert!(stream.next().await.is_none()); } // Tests TODO: From f6bf3017f878a4946885f6419e06948a404fab04 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 26 Jul 2022 10:20:27 -0400 Subject: [PATCH 005/397] Port Add 
stream upload (multi-part upload) (#2147) * feat: Add stream upload (multi-part upload) (#20) * feat: Implement multi-part upload Co-authored-by: Raphael Taylor-Davies * chore: simplify local file implementation * chore: Remove pin-project * feat: make cleanup_upload() top-level * docs: Add some docs for upload * chore: fix linting issue * fix: rename to put_multipart * feat: Implement multi-part upload for GCP * fix: Get GCS test to pass * chore: remove more upload language * fix: Add guard to test so we don't run with fake gcs server * chore: small tweaks * fix: apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * feat: switch to quick-xml * feat: remove throttle implementation of multipart * fix: rename from cleanup to abort * feat: enforce upload not readable until shutdown * fix: ensure we close files before moving them * chore: fix lint issue Co-authored-by: Raphael Taylor-Davies Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fmt * RAT multipart * Fix build * fix: merge issue Co-authored-by: Will Jones Co-authored-by: Raphael Taylor-Davies Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- Cargo.toml | 5 +- src/aws.rs | 231 +++++++++++++++++++++++++++++- src/azure.rs | 125 +++++++++++++++- src/gcp.rs | 340 ++++++++++++++++++++++++++++++++++++++++---- src/lib.rs | 105 +++++++++++++- src/local.rs | 361 ++++++++++++++++++++++++++++++++++++++++++++--- src/memory.rs | 69 ++++++++- src/multipart.rs | 195 +++++++++++++++++++++++++ src/throttle.rs | 17 +++ 9 files changed, 1392 insertions(+), 56 deletions(-) create mode 100644 src/multipart.rs diff --git a/Cargo.toml b/Cargo.toml index 613b6ab..7415398 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock"] } futures = "0.3" serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } +quick-xml = { version = "0.23.0", features = ["serialize"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } ring = { version = "0.16", default-features = false, features = ["std"] } base64 = { version = "0.13", default-features = false, optional = true } @@ -59,7 +60,7 @@ rusoto_credential = { version = "0.48.0", optional = true, default-features = fa rusoto_s3 = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } snafu = "0.7" -tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time"] } +tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } tracing = { version = "0.1" } reqwest = { version = "0.11", optional = true, default-features = false, features = ["rustls-tls"] } parking_lot = { version = "0.12" } @@ -70,7 +71,7 @@ walkdir = "2" [features] azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"] azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] -gcp = ["serde", "serde_json", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64"] +gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", 
"chrono/serde", "rustls-pemfile", "base64"] aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] [dev-dependencies] # In alphabetical order diff --git a/src/aws.rs b/src/aws.rs index 7ebcc2a..3606a38 100644 --- a/src/aws.rs +++ b/src/aws.rs @@ -16,7 +16,23 @@ // under the License. //! An object store implementation for S3 +//! +//! ## Multi-part uploads +//! +//! Multi-part uploads can be initiated with the [ObjectStore::put_multipart] method. +//! Data passed to the writer is automatically buffered to meet the minimum size +//! requirements for a part. Multiple parts are uploaded concurrently. +//! +//! If the writer fails for any reason, you may have parts uploaded to AWS but not +//! used that you may be charged for. Use the [ObjectStore::abort_multipart] method +//! to abort the upload and drop those unneeded parts. In addition, you may wish to +//! consider implementing [automatic cleanup] of unused parts that are older than one +//! week. +//! +//! [automatic cleanup]: https://aws.amazon.com/blogs/aws/s3-lifecycle-management-update-support-for-multipart-uploads-and-delete-markers/ +use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::util::format_http_range; +use crate::MultipartId; use crate::{ collect_bytes, path::{Path, DELIMITER}, @@ -26,6 +42,7 @@ use crate::{ use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; +use futures::future::BoxFuture; use futures::{ stream::{self, BoxStream}, Future, Stream, StreamExt, TryStreamExt, @@ -36,10 +53,12 @@ use rusoto_credential::{InstanceMetadataProvider, StaticProvider}; use rusoto_s3::S3; use rusoto_sts::WebIdentityProvider; use snafu::{OptionExt, ResultExt, Snafu}; +use std::io; use std::ops::Range; use std::{ convert::TryFrom, fmt, num::NonZeroUsize, ops::Deref, sync::Arc, time::Duration, }; +use tokio::io::AsyncWrite; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tracing::{debug, warn}; @@ -129,6 +148,32 @@ enum Error { path: String, }, + #[snafu(display( + "Unable to upload data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToUploadData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + + #[snafu(display( + "Unable to cleanup multipart data. Bucket: {}, Location: {}, Error: {} ({:?})", + bucket, + path, + source, + source, + ))] + UnableToCleanupMultipartData { + source: rusoto_core::RusotoError, + bucket: String, + path: String, + }, + #[snafu(display( "Unable to list data. 
Bucket: {}, Error: {} ({:?})", bucket, @@ -272,6 +317,71 @@ impl ObjectStore for AmazonS3 { Ok(()) } + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let bucket_name = self.bucket_name.clone(); + + let request_factory = move || rusoto_s3::CreateMultipartUploadRequest { + bucket: bucket_name.clone(), + key: location.to_string(), + ..Default::default() + }; + + let s3 = self.client().await; + + let data = s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + + async move { s3.create_multipart_upload(request_factory()).await } + }) + .await + .context(UnableToUploadDataSnafu { + bucket: &self.bucket_name, + path: location.as_ref(), + })?; + + let upload_id = data.upload_id.unwrap(); + + let inner = S3MultiPartUpload { + upload_id: upload_id.clone(), + bucket: self.bucket_name.clone(), + key: location.to_string(), + client_unrestricted: self.client_unrestricted.clone(), + connection_semaphore: Arc::clone(&self.connection_semaphore), + }; + + Ok((upload_id, Box::new(CloudMultiPartUpload::new(inner, 8)))) + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + let request_factory = move || rusoto_s3::AbortMultipartUploadRequest { + bucket: self.bucket_name.clone(), + key: location.to_string(), + upload_id: multipart_id.to_string(), + ..Default::default() + }; + + let s3 = self.client().await; + s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory); + + async move { s3.abort_multipart_upload(request_factory()).await } + }) + .await + .context(UnableToCleanupMultipartDataSnafu { + bucket: &self.bucket_name, + path: location.as_ref(), + })?; + + Ok(()) + } + async fn get(&self, location: &Path) -> Result { Ok(GetResult::Stream( self.get_object(location, None).await?.boxed(), @@ -821,13 +931,131 @@ impl Error { } } +struct S3MultiPartUpload { + bucket: String, + key: String, + upload_id: String, + client_unrestricted: rusoto_s3::S3Client, + connection_semaphore: Arc, +} + +impl CloudMultiPartUploadImpl for S3MultiPartUpload { + fn put_multipart_part( + &self, + buf: Vec, + part_idx: usize, + ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>> { + // Get values to move into future; we don't want a reference to Self + let bucket = self.bucket.clone(); + let key = self.key.clone(); + let upload_id = self.upload_id.clone(); + let content_length = buf.len(); + + let request_factory = move || rusoto_s3::UploadPartRequest { + bucket, + key, + upload_id, + // AWS part number is 1-indexed + part_number: (part_idx + 1).try_into().unwrap(), + content_length: Some(content_length.try_into().unwrap()), + body: Some(buf.into()), + ..Default::default() + }; + + let s3 = self.client_unrestricted.clone(); + let connection_semaphore = Arc::clone(&self.connection_semaphore); + + Box::pin(async move { + let _permit = connection_semaphore + .acquire_owned() + .await + .expect("semaphore shouldn't be closed yet"); + + let response = s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + async move { s3.upload_part(request_factory()).await } + }) + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + Ok(( + part_idx, + UploadPart { + content_id: response.e_tag.unwrap(), + }, + )) + }) + } + + fn complete( + &self, + completed_parts: Vec>, + ) -> BoxFuture<'static, Result<(), io::Error>> { + let parts = + completed_parts + .into_iter() + .enumerate() + .map(|(part_number, maybe_part)| match 
maybe_part { + Some(part) => { + Ok(rusoto_s3::CompletedPart { + e_tag: Some(part.content_id), + part_number: Some((part_number + 1).try_into().map_err( + |err| io::Error::new(io::ErrorKind::Other, err), + )?), + }) + } + None => Err(io::Error::new( + io::ErrorKind::Other, + format!("Missing information for upload part {:?}", part_number), + )), + }); + + // Get values to move into future; we don't want a reference to Self + let bucket = self.bucket.clone(); + let key = self.key.clone(); + let upload_id = self.upload_id.clone(); + + let request_factory = move || -> Result<_, io::Error> { + Ok(rusoto_s3::CompleteMultipartUploadRequest { + bucket, + key, + upload_id, + multipart_upload: Some(rusoto_s3::CompletedMultipartUpload { + parts: Some(parts.collect::>()?), + }), + ..Default::default() + }) + }; + + let s3 = self.client_unrestricted.clone(); + let connection_semaphore = Arc::clone(&self.connection_semaphore); + + Box::pin(async move { + let _permit = connection_semaphore + .acquire_owned() + .await + .expect("semaphore shouldn't be closed yet"); + + s3_request(move || { + let (s3, request_factory) = (s3.clone(), request_factory.clone()); + + async move { s3.complete_multipart_upload(request_factory()?).await } + }) + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + Ok(()) + }) + } +} + #[cfg(test)] mod tests { use super::*; use crate::{ tests::{ get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, + put_get_delete_list, rename_and_copy, stream_get, }, Error as ObjectStoreError, ObjectStore, }; @@ -943,6 +1171,7 @@ mod tests { check_credentials(list_uses_directories_correctly(&integration).await).unwrap(); check_credentials(list_with_delimiter(&integration).await).unwrap(); check_credentials(rename_and_copy(&integration).await).unwrap(); + check_credentials(stream_get(&integration).await).unwrap(); } #[tokio::test] diff --git a/src/azure.rs b/src/azure.rs index 75dafef..25f311a 100644 --- a/src/azure.rs +++ b/src/azure.rs @@ -16,10 +16,21 @@ // under the License. //! An object store implementation for Azure blob storage +//! +//! ## Streaming uploads +//! +//! [ObjectStore::put_multipart] will upload data in blocks and write a blob from those +//! blocks. Data is buffered internally to make blocks of at least 5MB and blocks +//! are uploaded concurrently. +//! +//! [ObjectStore::abort_multipart] is a no-op, since Azure Blob Store doesn't provide +//! a way to drop old blocks. Instead unused blocks are automatically cleaned up +//! after 7 days. 
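A minimal sketch (not part of this patch) of how a caller might drive this API, assuming the crate is consumed as `object_store` and `store` is any `ObjectStore` implementation such as the Azure store above; the helper name, object path and chunk source are illustrative only:

```rust
use object_store::{path::Path, ObjectStore};
use tokio::io::AsyncWriteExt;

async fn upload_chunks(
    store: &dyn ObjectStore,
    chunks: &[Vec<u8>],
) -> Result<(), Box<dyn std::error::Error>> {
    let location = Path::from("data/large_blob.bin");
    let (multipart_id, mut writer) = store.put_multipart(&location).await?;

    for chunk in chunks {
        if let Err(err) = writer.write_all(chunk).await {
            // Drop any parts uploaded so far (a no-op for Azure, as noted above).
            store.abort_multipart(&location, &multipart_id).await?;
            return Err(err.into());
        }
    }

    // The object only becomes visible once shutdown completes.
    writer.shutdown().await?;
    Ok(())
}
```

Written this way, the data is buffered into blocks of at least 5 MB and uploaded concurrently, per the module documentation above.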
use crate::{ + multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, util::format_prefix, - GetResult, ListResult, ObjectMeta, ObjectStore, Result, + GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; use async_trait::async_trait; use azure_core::{prelude::*, HttpClient}; @@ -32,12 +43,15 @@ use azure_storage_blobs::{ }; use bytes::Bytes; use futures::{ + future::BoxFuture, stream::{self, BoxStream}, StreamExt, TryStreamExt, }; use snafu::{ResultExt, Snafu}; use std::collections::BTreeSet; +use std::io; use std::{convert::TryInto, sync::Arc}; +use tokio::io::AsyncWrite; use url::Url; /// A specialized `Error` for Azure object store-related errors @@ -232,6 +246,27 @@ impl ObjectStore for MicrosoftAzure { Ok(()) } + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let inner = AzureMultiPartUpload { + container_client: Arc::clone(&self.container_client), + location: location.to_owned(), + }; + Ok((String::new(), Box::new(CloudMultiPartUpload::new(inner, 8)))) + } + + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> Result<()> { + // There is no way to drop blocks that have been uploaded. Instead, they simply + // expire in 7 days. + Ok(()) + } + async fn get(&self, location: &Path) -> Result { let blob = self .container_client @@ -604,6 +639,94 @@ pub fn new_azure( }) } +// Relevant docs: https://azure.github.io/Storage/docs/application-and-user-data/basics/azure-blob-storage-upload-apis/ +// In Azure Blob Store, parts are "blocks" +// put_multipart_part -> PUT block +// complete -> PUT block list +// abort -> No equivalent; blocks are simply dropped after 7 days +#[derive(Debug, Clone)] +struct AzureMultiPartUpload { + container_client: Arc, + location: Path, +} + +impl AzureMultiPartUpload { + /// Gets the block id corresponding to the part index. + /// + /// In Azure, the user determines what id each block has. They must be + /// unique within an upload and of consistent length. 
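+    // Note: the "{:20}" format below pads the decimal part index with spaces to a
+    // fixed width of 20 characters, keeping every block id the same length.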
+ fn get_block_id(&self, part_idx: usize) -> String { + format!("{:20}", part_idx) + } +} + +impl CloudMultiPartUploadImpl for AzureMultiPartUpload { + fn put_multipart_part( + &self, + buf: Vec, + part_idx: usize, + ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>> { + let client = Arc::clone(&self.container_client); + let location = self.location.clone(); + let block_id = self.get_block_id(part_idx); + + Box::pin(async move { + client + .as_blob_client(location.as_ref()) + .put_block(block_id.clone(), buf) + .execute() + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + Ok(( + part_idx, + UploadPart { + content_id: block_id, + }, + )) + }) + } + + fn complete( + &self, + completed_parts: Vec>, + ) -> BoxFuture<'static, Result<(), io::Error>> { + let parts = + completed_parts + .into_iter() + .enumerate() + .map(|(part_number, maybe_part)| match maybe_part { + Some(part) => { + Ok(azure_storage_blobs::blob::BlobBlockType::Uncommitted( + azure_storage_blobs::BlockId::new(part.content_id), + )) + } + None => Err(io::Error::new( + io::ErrorKind::Other, + format!("Missing information for upload part {:?}", part_number), + )), + }); + + let client = Arc::clone(&self.container_client); + let location = self.location.clone(); + + Box::pin(async move { + let block_list = azure_storage_blobs::blob::BlockList { + blocks: parts.collect::>()?, + }; + + client + .as_blob_client(location.as_ref()) + .put_block_list(&block_list) + .execute() + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + Ok(()) + }) + } +} + #[cfg(test)] mod tests { use crate::azure::new_azure; diff --git a/src/gcp.rs b/src/gcp.rs index e836cab..d740625 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -16,27 +16,44 @@ // under the License. //! An object store implementation for Google Cloud Storage +//! +//! ## Multi-part uploads +//! +//! [Multi-part uploads](https://cloud.google.com/storage/docs/multipart-uploads) +//! can be initiated with the [ObjectStore::put_multipart] method. +//! Data passed to the writer is automatically buffered to meet the minimum size +//! requirements for a part. Multiple parts are uploaded concurrently. +//! +//! If the writer fails for any reason, you may have parts uploaded to GCS but not +//! used that you may be charged for. Use the [ObjectStore::abort_multipart] method +//! to abort the upload and drop those unneeded parts. In addition, you may wish to +//! consider implementing automatic clean up of unused parts that are older than one +//! week. 
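For running the GCS integration tests against Fake GCS Server (as described in CONTRIBUTING.md), the `new_gcs_with_client` hook added earlier in this series lets the caller relax certificate checks. A rough sketch, assuming these items are exposed under `object_store::gcp`; the file path and bucket name are the ones from CONTRIBUTING.md:

```rust
use object_store::gcp::{new_gcs_with_client, GoogleCloudStorage};
use reqwest::Client;

fn fake_gcs_store() -> object_store::Result<GoogleCloudStorage> {
    // Fake GCS Server presents a self-signed certificate; never disable
    // certificate verification when talking to real GCS.
    let client = Client::builder()
        .danger_accept_invalid_certs(true)
        .build()
        .expect("error creating http client for testing");

    new_gcs_with_client("/tmp/gcs.json", "test-bucket", client)
}
```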
use std::collections::BTreeSet; use std::fs::File; -use std::io::BufReader; +use std::io::{self, BufReader}; use std::ops::Range; +use std::sync::Arc; use async_trait::async_trait; -use bytes::Bytes; +use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; +use futures::future::BoxFuture; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, Snafu}; +use tokio::io::AsyncWrite; +use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::util::format_http_range; use crate::{ oauth::OAuthProvider, path::{Path, DELIMITER}, token::TokenCache, util::format_prefix, - GetResult, ListResult, ObjectMeta, ObjectStore, Result, + GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; #[derive(Debug, Snafu)] @@ -47,6 +64,14 @@ enum Error { #[snafu(display("Unable to decode service account file: {}", source))] DecodeCredentials { source: serde_json::Error }, + #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] + InvalidXMLResponse { + source: quick_xml::de::DeError, + method: String, + url: String, + data: Bytes, + }, + #[snafu(display("Error performing list request: {}", source))] ListRequest { source: reqwest::Error }, @@ -139,9 +164,42 @@ struct Object { updated: DateTime, } +#[derive(serde::Deserialize, Debug)] +#[serde(rename_all = "PascalCase")] +struct InitiateMultipartUploadResult { + upload_id: String, +} + +#[derive(serde::Serialize, Debug)] +#[serde(rename_all = "PascalCase", rename(serialize = "Part"))] +struct MultipartPart { + #[serde(rename = "$unflatten=PartNumber")] + part_number: usize, + #[serde(rename = "$unflatten=ETag")] + e_tag: String, +} + +#[derive(serde::Serialize, Debug)] +#[serde(rename_all = "PascalCase")] +struct CompleteMultipartUpload { + #[serde(rename = "Part", default)] + parts: Vec, +} + /// Configuration for connecting to [Google Cloud Storage](https://cloud.google.com/storage/). #[derive(Debug)] pub struct GoogleCloudStorage { + client: Arc, +} + +impl std::fmt::Display for GoogleCloudStorage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "GoogleCloudStorage({})", self.client.bucket_name) + } +} + +#[derive(Debug)] +struct GoogleCloudStorageClient { client: Client, base_url: String, @@ -155,13 +213,7 @@ pub struct GoogleCloudStorage { max_list_results: Option, } -impl std::fmt::Display for GoogleCloudStorage { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "GoogleCloudStorage({})", self.bucket_name) - } -} - -impl GoogleCloudStorage { +impl GoogleCloudStorageClient { async fn get_token(&self) -> Result { if let Some(oauth_provider) = &self.oauth_provider { Ok(self @@ -243,6 +295,61 @@ impl GoogleCloudStorage { Ok(()) } + /// Initiate a multi-part upload + async fn multipart_initiate(&self, path: &Path) -> Result { + let token = self.get_token().await?; + let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); + + let response = self + .client + .request(Method::POST, &url) + .bearer_auth(token) + .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_LENGTH, "0") + .query(&[("uploads", "")]) + .send() + .await + .context(PutRequestSnafu)? 
+ .error_for_status() + .context(PutRequestSnafu)?; + + let data = response.bytes().await.context(PutRequestSnafu)?; + let result: InitiateMultipartUploadResult = quick_xml::de::from_reader( + data.as_ref().reader(), + ) + .context(InvalidXMLResponseSnafu { + method: "POST".to_string(), + url, + data, + })?; + + Ok(result.upload_id) + } + + /// Cleanup unused parts + async fn multipart_cleanup( + &self, + path: &str, + multipart_id: &MultipartId, + ) -> Result<()> { + let token = self.get_token().await?; + let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); + + self.client + .request(Method::DELETE, &url) + .bearer_auth(token) + .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_LENGTH, "0") + .query(&[("uploadId", multipart_id)]) + .send() + .await + .context(PutRequestSnafu)? + .error_for_status() + .context(PutRequestSnafu)?; + + Ok(()) + } + /// Perform a delete request async fn delete_request(&self, path: &Path) -> Result<()> { let token = self.get_token().await?; @@ -401,14 +508,184 @@ impl GoogleCloudStorage { } } +fn reqwest_error_as_io(err: reqwest::Error) -> io::Error { + if err.is_builder() || err.is_request() { + io::Error::new(io::ErrorKind::InvalidInput, err) + } else if err.is_status() { + match err.status() { + Some(StatusCode::NOT_FOUND) => io::Error::new(io::ErrorKind::NotFound, err), + Some(StatusCode::BAD_REQUEST) => { + io::Error::new(io::ErrorKind::InvalidInput, err) + } + Some(_) => io::Error::new(io::ErrorKind::Other, err), + None => io::Error::new(io::ErrorKind::Other, err), + } + } else if err.is_timeout() { + io::Error::new(io::ErrorKind::TimedOut, err) + } else if err.is_connect() { + io::Error::new(io::ErrorKind::NotConnected, err) + } else { + io::Error::new(io::ErrorKind::Other, err) + } +} + +struct GCSMultipartUpload { + client: Arc, + encoded_path: String, + multipart_id: MultipartId, +} + +impl CloudMultiPartUploadImpl for GCSMultipartUpload { + /// Upload an object part + fn put_multipart_part( + &self, + buf: Vec, + part_idx: usize, + ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>> { + let upload_id = self.multipart_id.clone(); + let url = format!( + "{}/{}/{}", + self.client.base_url, self.client.bucket_name_encoded, self.encoded_path + ); + let client = Arc::clone(&self.client); + + Box::pin(async move { + let token = client + .get_token() + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + let response = client + .client + .request(Method::PUT, &url) + .bearer_auth(token) + .query(&[ + ("partNumber", format!("{}", part_idx + 1)), + ("uploadId", upload_id), + ]) + .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_LENGTH, format!("{}", buf.len())) + .body(buf) + .send() + .await + .map_err(reqwest_error_as_io)? + .error_for_status() + .map_err(reqwest_error_as_io)?; + + let content_id = response + .headers() + .get("ETag") + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "response headers missing ETag", + ) + })? + .to_str() + .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))? 
+ .to_string(); + + Ok((part_idx, UploadPart { content_id })) + }) + } + + /// Complete a multipart upload + fn complete( + &self, + completed_parts: Vec>, + ) -> BoxFuture<'static, Result<(), io::Error>> { + let client = Arc::clone(&self.client); + let upload_id = self.multipart_id.clone(); + let url = format!( + "{}/{}/{}", + self.client.base_url, self.client.bucket_name_encoded, self.encoded_path + ); + + Box::pin(async move { + let parts: Vec = completed_parts + .into_iter() + .enumerate() + .map(|(part_number, maybe_part)| match maybe_part { + Some(part) => Ok(MultipartPart { + e_tag: part.content_id, + part_number: part_number + 1, + }), + None => Err(io::Error::new( + io::ErrorKind::Other, + format!("Missing information for upload part {:?}", part_number), + )), + }) + .collect::, io::Error>>()?; + + let token = client + .get_token() + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + let upload_info = CompleteMultipartUpload { parts }; + + let data = quick_xml::se::to_string(&upload_info) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? + // We cannot disable the escaping that transforms "/" to ""e;" :( + // https://github.com/tafia/quick-xml/issues/362 + // https://github.com/tafia/quick-xml/issues/350 + .replace(""", "\""); + + client + .client + .request(Method::POST, &url) + .bearer_auth(token) + .query(&[("uploadId", upload_id)]) + .body(data) + .send() + .await + .map_err(reqwest_error_as_io)? + .error_for_status() + .map_err(reqwest_error_as_io)?; + + Ok(()) + }) + } +} + #[async_trait] impl ObjectStore for GoogleCloudStorage { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.put_request(location, bytes).await + self.client.put_request(location, bytes).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let upload_id = self.client.multipart_initiate(location).await?; + + let encoded_path = + percent_encode(location.to_string().as_bytes(), NON_ALPHANUMERIC).to_string(); + + let inner = GCSMultipartUpload { + client: Arc::clone(&self.client), + encoded_path, + multipart_id: upload_id.clone(), + }; + + Ok((upload_id, Box::new(CloudMultiPartUpload::new(inner, 8)))) + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + self.client + .multipart_cleanup(location.as_ref(), multipart_id) + .await?; + + Ok(()) } async fn get(&self, location: &Path) -> Result { - let response = self.get_request(location, None, false).await?; + let response = self.client.get_request(location, None, false).await?; let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { @@ -421,14 +698,17 @@ impl ObjectStore for GoogleCloudStorage { } async fn get_range(&self, location: &Path, range: Range) -> Result { - let response = self.get_request(location, Some(range), false).await?; + let response = self + .client + .get_request(location, Some(range), false) + .await?; Ok(response.bytes().await.context(GetRequestSnafu { path: location.as_ref(), })?) 
} async fn head(&self, location: &Path) -> Result { - let response = self.get_request(location, None, true).await?; + let response = self.client.get_request(location, None, true).await?; let object = response.json().await.context(GetRequestSnafu { path: location.as_ref(), })?; @@ -436,7 +716,7 @@ impl ObjectStore for GoogleCloudStorage { } async fn delete(&self, location: &Path) -> Result<()> { - self.delete_request(location).await + self.client.delete_request(location).await } async fn list( @@ -444,6 +724,7 @@ impl ObjectStore for GoogleCloudStorage { prefix: Option<&Path>, ) -> Result>> { let stream = self + .client .list_paginated(prefix, false)? .map_ok(|r| { futures::stream::iter( @@ -457,7 +738,7 @@ impl ObjectStore for GoogleCloudStorage { } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.list_paginated(prefix, true)?; + let mut stream = self.client.list_paginated(prefix, true)?; let mut common_prefixes = BTreeSet::new(); let mut objects = Vec::new(); @@ -482,11 +763,11 @@ impl ObjectStore for GoogleCloudStorage { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - self.copy_request(from, to, false).await + self.client.copy_request(from, to, false).await } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - self.copy_request(from, to, true).await + self.client.copy_request(from, to, true).await } } @@ -537,13 +818,15 @@ pub fn new_gcs_with_client( // environment variables. Set the environment variable explicitly so // that we can optionally accept command line arguments instead. Ok(GoogleCloudStorage { - client, - base_url: credentials.gcs_base_url, - oauth_provider, - token_cache: Default::default(), - bucket_name, - bucket_name_encoded: encoded_bucket_name, - max_list_results: None, + client: Arc::new(GoogleCloudStorageClient { + client, + base_url: credentials.gcs_base_url, + oauth_provider, + token_cache: Default::default(), + bucket_name, + bucket_name_encoded: encoded_bucket_name, + max_list_results: None, + }), }) } @@ -568,7 +851,7 @@ mod test { use crate::{ tests::{ get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, + put_get_delete_list, rename_and_copy, stream_get, }, Error as ObjectStoreError, ObjectStore, }; @@ -648,6 +931,11 @@ mod test { list_uses_directories_correctly(&integration).await.unwrap(); list_with_delimiter(&integration).await.unwrap(); rename_and_copy(&integration).await.unwrap(); + if integration.client.base_url == default_gcs_base_url() { + // Fake GCS server does not yet implement XML Multipart uploads + // https://github.com/fsouza/fake-gcs-server/issues/852 + stream_get(&integration).await.unwrap(); + } } #[tokio::test] diff --git a/src/lib.rs b/src/lib.rs index 2dc6506..54d2827 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,7 +30,7 @@ //! //! This crate provides APIs for interacting with object storage services. //! -//! It currently supports PUT, GET, DELETE, HEAD and list for: +//! It currently supports PUT (single or chunked/concurrent), GET, DELETE, HEAD and list for: //! //! * [Google Cloud Storage](https://cloud.google.com/storage/) //! 
* [Amazon S3](https://aws.amazon.com/s3/) @@ -56,6 +56,8 @@ mod oauth; #[cfg(feature = "gcp")] mod token; +#[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] +mod multipart; mod util; use crate::path::Path; @@ -68,16 +70,45 @@ use snafu::Snafu; use std::fmt::{Debug, Formatter}; use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; +use tokio::io::AsyncWrite; /// An alias for a dynamically dispatched object store implementation. pub type DynObjectStore = dyn ObjectStore; +/// Id type for multi-part uploads. +pub type MultipartId = String; + /// Universal API to multiple object store services. #[async_trait] pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Save the provided bytes to the specified location. async fn put(&self, location: &Path, bytes: Bytes) -> Result<()>; + /// Get a multi-part upload that allows writing data in chunks + /// + /// Most cloud-based uploads will buffer and upload parts in parallel. + /// + /// To complete the upload, [AsyncWrite::poll_shutdown] must be called + /// to completion. + /// + /// For some object stores (S3, GCS, and local in particular), if the + /// writer fails or panics, you must call [ObjectStore::abort_multipart] + /// to clean up partially written data. + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)>; + + /// Cleanup an aborted upload. + /// + /// See documentation for individual stores for exact behavior, as capabilities + /// vary by object store. + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()>; + /// Return the bytes that are stored at the specified location. async fn get(&self, location: &Path) -> Result; @@ -330,6 +361,7 @@ mod test_util { mod tests { use super::*; use crate::test_util::flatten_list_stream; + use tokio::io::AsyncWriteExt; type Error = Box; type Result = std::result::Result; @@ -497,6 +529,77 @@ mod tests { Ok(()) } + fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { + std::iter::repeat(Bytes::from_iter(std::iter::repeat(b'x').take(chunk_length))) + .take(num_chunks) + .collect() + } + + pub(crate) async fn stream_get(storage: &DynObjectStore) -> Result<()> { + let location = Path::from("test_dir/test_upload_file.txt"); + + // Can write to storage + let data = get_vec_of_bytes(5_000_000, 10); + let bytes_expected = data.concat(); + let (_, mut writer) = storage.put_multipart(&location).await?; + for chunk in &data { + writer.write_all(chunk).await?; + } + + // Object should not yet exist in store + let meta_res = storage.head(&location).await; + assert!(meta_res.is_err()); + assert!(matches!( + meta_res.unwrap_err(), + crate::Error::NotFound { .. 
} + )); + + writer.shutdown().await?; + let bytes_written = storage.get(&location).await?.bytes().await?; + assert_eq!(bytes_expected, bytes_written); + + // Can overwrite some storage + let data = get_vec_of_bytes(5_000, 5); + let bytes_expected = data.concat(); + let (_, mut writer) = storage.put_multipart(&location).await?; + for chunk in &data { + writer.write_all(chunk).await?; + } + writer.shutdown().await?; + let bytes_written = storage.get(&location).await?.bytes().await?; + assert_eq!(bytes_expected, bytes_written); + + // We can abort an empty write + let location = Path::from("test_dir/test_abort_upload.txt"); + let (upload_id, writer) = storage.put_multipart(&location).await?; + drop(writer); + storage.abort_multipart(&location, &upload_id).await?; + let get_res = storage.get(&location).await; + assert!(get_res.is_err()); + assert!(matches!( + get_res.unwrap_err(), + crate::Error::NotFound { .. } + )); + + // We can abort an in-progress write + let (upload_id, mut writer) = storage.put_multipart(&location).await?; + if let Some(chunk) = data.get(0) { + writer.write_all(chunk).await?; + let _ = writer.write(chunk).await?; + } + drop(writer); + + storage.abort_multipart(&location, &upload_id).await?; + let get_res = storage.get(&location).await; + assert!(get_res.is_err()); + assert!(matches!( + get_res.unwrap_err(), + crate::Error::NotFound { .. } + )); + + Ok(()) + } + pub(crate) async fn list_uses_directories_correctly( storage: &DynObjectStore, ) -> Result<()> { diff --git a/src/local.rs b/src/local.rs index 8a9462e..798edef 100644 --- a/src/local.rs +++ b/src/local.rs @@ -19,18 +19,23 @@ use crate::{ maybe_spawn_blocking, path::{filesystem_path_to_url, Path}, - GetResult, ListResult, ObjectMeta, ObjectStore, Result, + GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; use async_trait::async_trait; use bytes::Bytes; +use futures::future::BoxFuture; +use futures::FutureExt; use futures::{stream::BoxStream, StreamExt}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::collections::VecDeque; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; use std::ops::Range; +use std::pin::Pin; use std::sync::Arc; +use std::task::Poll; use std::{collections::BTreeSet, convert::TryFrom, io}; +use std::{collections::VecDeque, path::PathBuf}; +use tokio::io::AsyncWrite; use url::Url; use walkdir::{DirEntry, WalkDir}; @@ -233,24 +238,7 @@ impl ObjectStore for LocalFileSystem { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { - let mut file = match File::create(&path) { - Ok(f) => f, - Err(err) if err.kind() == std::io::ErrorKind::NotFound => { - let parent = path - .parent() - .context(UnableToCreateFileSnafu { path: &path, err })?; - std::fs::create_dir_all(&parent) - .context(UnableToCreateDirSnafu { path: parent })?; - - match File::create(&path) { - Ok(f) => f, - Err(err) => { - return Err(Error::UnableToCreateFile { path, err }.into()) - } - } - } - Err(err) => return Err(Error::UnableToCreateFile { path, err }.into()), - }; + let mut file = open_writable_file(&path)?; file.write_all(&bytes) .context(UnableToCopyDataToFileSnafu)?; @@ -260,6 +248,53 @@ impl ObjectStore for LocalFileSystem { .await } + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let dest = self.config.path_to_filesystem(location)?; + + // Generate an id in case of concurrent writes + let mut multipart_id = 1; + + // Will write to a temporary path + let staging_path = loop { + let staging_path = 
get_upload_stage_path(&dest, &multipart_id.to_string()); + + match std::fs::metadata(&staging_path) { + Err(err) if err.kind() == io::ErrorKind::NotFound => break staging_path, + Err(err) => { + return Err(Error::UnableToCopyDataToFile { source: err }.into()) + } + Ok(_) => multipart_id += 1, + } + }; + let multipart_id = multipart_id.to_string(); + + let file = open_writable_file(&staging_path)?; + + Ok(( + multipart_id.clone(), + Box::new(LocalUpload::new(dest, multipart_id, Arc::new(file))), + )) + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + let dest = self.config.path_to_filesystem(location)?; + let staging_path: PathBuf = get_upload_stage_path(&dest, multipart_id); + + maybe_spawn_blocking(move || { + std::fs::remove_file(&staging_path) + .context(UnableToDeleteFileSnafu { path: staging_path })?; + Ok(()) + }) + .await + } + async fn get(&self, location: &Path) -> Result { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { @@ -343,7 +378,12 @@ impl ObjectStore for LocalFileSystem { Err(e) => Some(Err(e)), Ok(None) => None, Ok(entry @ Some(_)) => entry - .filter(|dir_entry| dir_entry.file_type().is_file()) + .filter(|dir_entry| { + dir_entry.file_type().is_file() + // Ignore file names with # in them, since they might be in-progress uploads. + // They would be rejected anyways by filesystem_to_path below. + && !dir_entry.file_name().to_string_lossy().contains('#') + }) .map(|entry| { let location = config.filesystem_to_path(entry.path())?; convert_entry(entry, location) @@ -400,6 +440,13 @@ impl ObjectStore for LocalFileSystem { for entry_res in walkdir.into_iter().map(convert_walkdir_result) { if let Some(entry) = entry_res? { + if entry.file_type().is_file() + // Ignore file names with # in them, since they might be in-progress uploads. + // They would be rejected anyways by filesystem_to_path below. + && entry.file_name().to_string_lossy().contains('#') + { + continue; + } let is_directory = entry.file_type().is_dir(); let entry_location = config.filesystem_to_path(entry.path())?; @@ -475,6 +522,216 @@ impl ObjectStore for LocalFileSystem { } } +fn get_upload_stage_path(dest: &std::path::Path, multipart_id: &MultipartId) -> PathBuf { + let mut staging_path = dest.as_os_str().to_owned(); + staging_path.push(format!("#{}", multipart_id)); + staging_path.into() +} + +enum LocalUploadState { + /// Upload is ready to send new data + Idle(Arc), + /// In the middle of a write + Writing( + Arc, + BoxFuture<'static, Result>, + ), + /// In the middle of syncing data and closing file. + /// + /// Future will contain last reference to file, so it will call drop on completion. 
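+    /// When a Tokio runtime is available, `sync_all` is dispatched to the blocking
+    /// pool via `spawn_blocking`; otherwise it runs inline on the calling thread.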
+ ShuttingDown(BoxFuture<'static, Result<(), io::Error>>), + /// File is being moved from it's temporary location to the final location + Committing(BoxFuture<'static, Result<(), io::Error>>), + /// Upload is complete + Complete, +} + +struct LocalUpload { + inner_state: LocalUploadState, + dest: PathBuf, + multipart_id: MultipartId, +} + +impl LocalUpload { + pub fn new( + dest: PathBuf, + multipart_id: MultipartId, + file: Arc, + ) -> Self { + Self { + inner_state: LocalUploadState::Idle(file), + dest, + multipart_id, + } + } +} + +impl AsyncWrite for LocalUpload { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll> { + let invalid_state = + |condition: &str| -> std::task::Poll> { + Poll::Ready(Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("Tried to write to file {}.", condition), + ))) + }; + + if let Ok(runtime) = tokio::runtime::Handle::try_current() { + let mut data: Vec = buf.to_vec(); + let data_len = data.len(); + + loop { + match &mut self.inner_state { + LocalUploadState::Idle(file) => { + let file = Arc::clone(file); + let file2 = Arc::clone(&file); + let data: Vec = std::mem::take(&mut data); + self.inner_state = LocalUploadState::Writing( + file, + Box::pin( + runtime + .spawn_blocking(move || (&*file2).write_all(&data)) + .map(move |res| match res { + Err(err) => { + Err(io::Error::new(io::ErrorKind::Other, err)) + } + Ok(res) => res.map(move |_| data_len), + }), + ), + ); + } + LocalUploadState::Writing(file, inner_write) => { + match inner_write.poll_unpin(cx) { + Poll::Ready(res) => { + self.inner_state = + LocalUploadState::Idle(Arc::clone(file)); + return Poll::Ready(res); + } + Poll::Pending => { + return Poll::Pending; + } + } + } + LocalUploadState::ShuttingDown(_) => { + return invalid_state("when writer is shutting down"); + } + LocalUploadState::Committing(_) => { + return invalid_state("when writer is committing data"); + } + LocalUploadState::Complete => { + return invalid_state("when writer is complete"); + } + } + } + } else if let LocalUploadState::Idle(file) = &self.inner_state { + let file = Arc::clone(file); + (&*file).write_all(buf)?; + Poll::Ready(Ok(buf.len())) + } else { + // If we are running on this thread, then only possible states are Idle and Complete. + invalid_state("when writer is already complete.") + } + } + + fn poll_flush( + self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Poll::Ready(Ok(())) + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + if let Ok(runtime) = tokio::runtime::Handle::try_current() { + loop { + match &mut self.inner_state { + LocalUploadState::Idle(file) => { + // We are moving file into the future, and it will be dropped on it's completion, closing the file. 
+ let file = Arc::clone(file); + self.inner_state = LocalUploadState::ShuttingDown(Box::pin( + runtime.spawn_blocking(move || (*file).sync_all()).map( + move |res| match res { + Err(err) => { + Err(io::Error::new(io::ErrorKind::Other, err)) + } + Ok(res) => res, + }, + ), + )); + } + LocalUploadState::ShuttingDown(fut) => match fut.poll_unpin(cx) { + Poll::Ready(res) => { + res?; + let staging_path = + get_upload_stage_path(&self.dest, &self.multipart_id); + let dest = self.dest.clone(); + self.inner_state = LocalUploadState::Committing(Box::pin( + runtime + .spawn_blocking(move || { + std::fs::rename(&staging_path, &dest) + }) + .map(move |res| match res { + Err(err) => { + Err(io::Error::new(io::ErrorKind::Other, err)) + } + Ok(res) => res, + }), + )); + } + Poll::Pending => { + return Poll::Pending; + } + }, + LocalUploadState::Writing(_, _) => { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Tried to commit a file where a write is in progress.", + ))); + } + LocalUploadState::Committing(fut) => match fut.poll_unpin(cx) { + Poll::Ready(res) => { + self.inner_state = LocalUploadState::Complete; + return Poll::Ready(res); + } + Poll::Pending => return Poll::Pending, + }, + LocalUploadState::Complete => { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::Other, + "Already complete", + ))) + } + } + } + } else { + let staging_path = get_upload_stage_path(&self.dest, &self.multipart_id); + match &mut self.inner_state { + LocalUploadState::Idle(file) => { + let file = Arc::clone(file); + self.inner_state = LocalUploadState::Complete; + file.sync_all()?; + std::mem::drop(file); + std::fs::rename(&staging_path, &self.dest)?; + Poll::Ready(Ok(())) + } + _ => { + // If we are running on this thread, then only possible states are Idle and Complete. 
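+                    // (Writing, ShuttingDown and Committing are only entered from the
+                    // runtime branches above, so reaching this arm means Complete.)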
+ Poll::Ready(Err(io::Error::new( + io::ErrorKind::Other, + "Already complete", + ))) + } + } + } + } +} + fn open_file(path: &std::path::PathBuf) -> Result { let file = File::open(path).map_err(|e| { if e.kind() == std::io::ErrorKind::NotFound { @@ -492,6 +749,33 @@ fn open_file(path: &std::path::PathBuf) -> Result { Ok(file) } +fn open_writable_file(path: &std::path::PathBuf) -> Result { + match File::create(&path) { + Ok(f) => Ok(f), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + let parent = path + .parent() + .context(UnableToCreateFileSnafu { path: &path, err })?; + std::fs::create_dir_all(&parent) + .context(UnableToCreateDirSnafu { path: parent })?; + + match File::create(&path) { + Ok(f) => Ok(f), + Err(err) => Err(Error::UnableToCreateFile { + path: path.to_path_buf(), + err, + } + .into()), + } + } + Err(err) => Err(Error::UnableToCreateFile { + path: path.to_path_buf(), + err, + } + .into()), + } +} + fn convert_entry(entry: DirEntry, location: Path) -> Result { let metadata = entry .metadata() @@ -548,11 +832,12 @@ mod tests { use crate::{ tests::{ copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, rename_and_copy, + list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, }, Error as ObjectStoreError, ObjectStore, }; use tempfile::TempDir; + use tokio::io::AsyncWriteExt; #[tokio::test] async fn file_test() { @@ -564,6 +849,7 @@ mod tests { list_with_delimiter(&integration).await.unwrap(); rename_and_copy(&integration).await.unwrap(); copy_if_not_exists(&integration).await.unwrap(); + stream_get(&integration).await.unwrap(); } #[test] @@ -574,6 +860,7 @@ mod tests { put_get_delete_list(&integration).await.unwrap(); list_uses_directories_correctly(&integration).await.unwrap(); list_with_delimiter(&integration).await.unwrap(); + stream_get(&integration).await.unwrap(); }); } @@ -770,4 +1057,34 @@ mod tests { err ); } + + #[tokio::test] + async fn list_hides_incomplete_uploads() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + let (multipart_id, mut writer) = + integration.put_multipart(&location).await.unwrap(); + writer.write_all(&data).await.unwrap(); + + let (multipart_id_2, mut writer_2) = + integration.put_multipart(&location).await.unwrap(); + assert_ne!(multipart_id, multipart_id_2); + writer_2.write_all(&data).await.unwrap(); + + let list = flatten_list_stream(&integration, None).await.unwrap(); + assert_eq!(list.len(), 0); + + assert_eq!( + integration + .list_with_delimiter(None) + .await + .unwrap() + .objects + .len(), + 0 + ); + } } diff --git a/src/memory.rs b/src/memory.rs index ffd8e3a..dc3967d 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -16,6 +16,7 @@ // under the License. //! An in-memory object store implementation +use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; use async_trait::async_trait; use bytes::Bytes; @@ -25,7 +26,12 @@ use parking_lot::RwLock; use snafu::{ensure, OptionExt, Snafu}; use std::collections::BTreeMap; use std::collections::BTreeSet; +use std::io; use std::ops::Range; +use std::pin::Pin; +use std::sync::Arc; +use std::task::Poll; +use tokio::io::AsyncWrite; /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] @@ -67,7 +73,7 @@ impl From for super::Error { /// storage provider. 
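+/// The object map is shared behind an `Arc<RwLock<..>>` so that the writer
+/// returned by `put_multipart` can hold a clone and publish its buffered bytes
+/// into the same map when the upload is shut down.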
#[derive(Debug, Default)] pub struct InMemory { - storage: RwLock>, + storage: Arc>>, } impl std::fmt::Display for InMemory { @@ -83,6 +89,29 @@ impl ObjectStore for InMemory { Ok(()) } + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + Ok(( + String::new(), + Box::new(InMemoryUpload { + location: location.clone(), + data: Vec::new(), + storage: Arc::clone(&self.storage), + }), + )) + } + + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> Result<()> { + // Nothing to clean up + Ok(()) + } + async fn get(&self, location: &Path) -> Result { let data = self.get_bytes(location).await?; @@ -211,7 +240,7 @@ impl InMemory { let storage = storage.clone(); Self { - storage: RwLock::new(storage), + storage: Arc::new(RwLock::new(storage)), } } @@ -227,6 +256,39 @@ impl InMemory { } } +struct InMemoryUpload { + location: Path, + data: Vec, + storage: Arc>>, +} + +impl AsyncWrite for InMemoryUpload { + fn poll_write( + mut self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll> { + self.data.extend_from_slice(buf); + Poll::Ready(Ok(buf.len())) + } + + fn poll_flush( + self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Poll::Ready(Ok(())) + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let data = Bytes::from(std::mem::take(&mut self.data)); + self.storage.write().insert(self.location.clone(), data); + Poll::Ready(Ok(())) + } +} + #[cfg(test)] mod tests { use super::*; @@ -234,7 +296,7 @@ mod tests { use crate::{ tests::{ copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, rename_and_copy, + list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, }, Error as ObjectStoreError, ObjectStore, }; @@ -248,6 +310,7 @@ mod tests { list_with_delimiter(&integration).await.unwrap(); rename_and_copy(&integration).await.unwrap(); copy_if_not_exists(&integration).await.unwrap(); + stream_get(&integration).await.unwrap(); } #[tokio::test] diff --git a/src/multipart.rs b/src/multipart.rs new file mode 100644 index 0000000..c16022d --- /dev/null +++ b/src/multipart.rs @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
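The `CloudMultiPartUploadImpl` trait defined below has only two responsibilities: upload one buffered part and stitch the completed parts together. As a rough in-crate sketch (the trait is `pub(crate)`), a hypothetical test double — the name `VecUpload` and its behaviour are not part of the patch — could collect parts keyed by index so that out-of-order completion still reassembles correctly:

```rust
use futures::future::BoxFuture;
use std::collections::BTreeMap;
use std::io;
use std::sync::{Arc, Mutex};

// Hypothetical test double: "uploads" land in a shared, index-keyed map instead
// of going over the network, mirroring what the S3/Azure/GCS impls do remotely.
struct VecUpload {
    parts: Arc<Mutex<BTreeMap<usize, Vec<u8>>>>,
}

impl CloudMultiPartUploadImpl for VecUpload {
    fn put_multipart_part(
        &self,
        buf: Vec<u8>,
        part_idx: usize,
    ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>> {
        let parts = Arc::clone(&self.parts);
        Box::pin(async move {
            parts.lock().unwrap().insert(part_idx, buf);
            Ok((
                part_idx,
                UploadPart {
                    content_id: part_idx.to_string(),
                },
            ))
        })
    }

    fn complete(
        &self,
        completed_parts: Vec<Option<UploadPart>>,
    ) -> BoxFuture<'static, Result<(), io::Error>> {
        // A real store would send the ordered content ids back to the service here.
        let all_present = completed_parts.iter().all(|p| p.is_some());
        Box::pin(async move {
            match all_present {
                true => Ok(()),
                false => Err(io::Error::new(io::ErrorKind::Other, "missing part")),
            }
        })
    }
}
```

Wrapping such an implementation in `CloudMultiPartUpload::new(inner, 8)` yields the `AsyncWrite` that the cloud stores return from `put_multipart`.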
+ +use futures::{future::BoxFuture, stream::FuturesUnordered, Future, StreamExt}; +use std::{io, pin::Pin, sync::Arc, task::Poll}; +use tokio::io::AsyncWrite; + +use crate::Result; + +type BoxedTryFuture = Pin> + Send>>; + +/// A trait that can be implemented by cloud-based object stores +/// and used in combination with [`CloudMultiPartUpload`] to provide +/// multipart upload support +/// +/// Note: this does not use AsyncTrait as the lifetimes are difficult to manage +pub(crate) trait CloudMultiPartUploadImpl { + /// Upload a single part + fn put_multipart_part( + &self, + buf: Vec, + part_idx: usize, + ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>>; + + /// Complete the upload with the provided parts + /// + /// `completed_parts` is in order of part number + fn complete( + &self, + completed_parts: Vec>, + ) -> BoxFuture<'static, Result<(), io::Error>>; +} + +#[derive(Debug, Clone)] +pub(crate) struct UploadPart { + pub content_id: String, +} + +pub(crate) struct CloudMultiPartUpload +where + T: CloudMultiPartUploadImpl, +{ + inner: Arc, + /// A list of completed parts, in sequential order. + completed_parts: Vec>, + /// Part upload tasks currently running + tasks: FuturesUnordered>, + /// Maximum number of upload tasks to run concurrently + max_concurrency: usize, + /// Buffer that will be sent in next upload. + current_buffer: Vec, + /// Minimum size of a part in bytes + min_part_size: usize, + /// Index of current part + current_part_idx: usize, + /// The completion task + completion_task: Option>, +} + +impl CloudMultiPartUpload +where + T: CloudMultiPartUploadImpl, +{ + pub fn new(inner: T, max_concurrency: usize) -> Self { + Self { + inner: Arc::new(inner), + completed_parts: Vec::new(), + tasks: FuturesUnordered::new(), + max_concurrency, + current_buffer: Vec::new(), + // TODO: Should self vary by provider? + // TODO: Should we automatically increase then when part index gets large? + min_part_size: 5_000_000, + current_part_idx: 0, + completion_task: None, + } + } + + pub fn poll_tasks( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Result<(), io::Error> { + if self.tasks.is_empty() { + return Ok(()); + } + let total_parts = self.completed_parts.len(); + while let Poll::Ready(Some(res)) = self.tasks.poll_next_unpin(cx) { + let (part_idx, part) = res?; + self.completed_parts + .resize(std::cmp::max(part_idx + 1, total_parts), None); + self.completed_parts[part_idx] = Some(part); + } + Ok(()) + } +} + +impl AsyncWrite for CloudMultiPartUpload +where + T: CloudMultiPartUploadImpl + Send + Sync, +{ + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll> { + // Poll current tasks + self.as_mut().poll_tasks(cx)?; + + // If adding buf to pending buffer would trigger send, check + // whether we have capacity for another task. + let enough_to_send = (buf.len() + self.current_buffer.len()) > self.min_part_size; + if enough_to_send && self.tasks.len() < self.max_concurrency { + // If we do, copy into the buffer and submit the task, and return ready. 
+ self.current_buffer.extend_from_slice(buf); + + let out_buffer = std::mem::take(&mut self.current_buffer); + let task = self + .inner + .put_multipart_part(out_buffer, self.current_part_idx); + self.tasks.push(task); + self.current_part_idx += 1; + + // We need to poll immediately after adding to setup waker + self.as_mut().poll_tasks(cx)?; + + Poll::Ready(Ok(buf.len())) + } else if !enough_to_send { + self.current_buffer.extend_from_slice(buf); + Poll::Ready(Ok(buf.len())) + } else { + // Waker registered by call to poll_tasks at beginning + Poll::Pending + } + } + + fn poll_flush( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + // Poll current tasks + self.as_mut().poll_tasks(cx)?; + + // If current_buffer is not empty, see if it can be submitted + if !self.current_buffer.is_empty() && self.tasks.len() < self.max_concurrency { + let out_buffer: Vec = std::mem::take(&mut self.current_buffer); + let task = self + .inner + .put_multipart_part(out_buffer, self.current_part_idx); + self.tasks.push(task); + } + + self.as_mut().poll_tasks(cx)?; + + // If tasks and current_buffer are empty, return Ready + if self.tasks.is_empty() && self.current_buffer.is_empty() { + Poll::Ready(Ok(())) + } else { + Poll::Pending + } + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + // First, poll flush + match self.as_mut().poll_flush(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(res) => res?, + }; + + // If shutdown task is not set, set it + let parts = std::mem::take(&mut self.completed_parts); + let inner = Arc::clone(&self.inner); + let completion_task = self + .completion_task + .get_or_insert_with(|| inner.complete(parts)); + + Pin::new(completion_task).poll(cx) + } +} diff --git a/src/throttle.rs b/src/throttle.rs index 6560296..6789f0e 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -20,11 +20,13 @@ use parking_lot::Mutex; use std::ops::Range; use std::{convert::TryInto, sync::Arc}; +use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; use async_trait::async_trait; use bytes::Bytes; use futures::{stream::BoxStream, StreamExt}; use std::time::Duration; +use tokio::io::AsyncWrite; /// Configuration settings for throttled store #[derive(Debug, Default, Clone, Copy)] @@ -149,6 +151,21 @@ impl ObjectStore for ThrottledStore { self.inner.put(location, bytes).await } + async fn put_multipart( + &self, + _location: &Path, + ) -> Result<(MultipartId, Box)> { + Err(super::Error::NotImplemented) + } + + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> Result<()> { + Err(super::Error::NotImplemented) + } + async fn get(&self, location: &Path) -> Result { sleep(self.config().wait_get_per_call).await; From 20d54c4286d616ac00500f20562c6df01a702c78 Mon Sep 17 00:00:00 2001 From: Jean-Charles Campagne Date: Thu, 28 Jul 2022 10:57:34 +0100 Subject: [PATCH 006/397] Ignore broken symlinks for LocalFileSystem object store (#2195) * Re-enable test_list_root test for MacOS * LocalFileSystem: ignore broken links in convert_walkdir_result --- src/local.rs | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/local.rs b/src/local.rs index 798edef..e2f133e 100644 --- a/src/local.rs +++ b/src/local.rs @@ -27,7 +27,7 @@ use futures::future::BoxFuture; use futures::FutureExt; use futures::{stream::BoxStream, StreamExt}; use snafu::{ensure, OptionExt, 
ResultExt, Snafu}; -use std::fs::File; +use std::fs::{metadata, symlink_metadata, File}; use std::io::{Read, Seek, SeekFrom, Write}; use std::ops::Range; use std::pin::Pin; @@ -804,11 +804,36 @@ fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result, ) -> Result> { match res { - Ok(entry) => Ok(Some(entry)), + Ok(entry) => { + // To check for broken symlink: call symlink_metadata() - it does not traverse symlinks); + // if ok: check if entry is symlink; and try to read it by calling metadata(). + match symlink_metadata(entry.path()) { + Ok(attr) => { + if attr.is_symlink() { + let target_metadata = metadata(entry.path()); + match target_metadata { + Ok(_) => { + // symlink is valid + Ok(Some(entry)) + } + Err(_) => { + // this is a broken symlink, return None + Ok(None) + } + } + } else { + Ok(Some(entry)) + } + } + Err(_) => Ok(None), + } + } + Err(walkdir_err) => match walkdir_err.io_error() { Some(io_err) => match io_err.kind() { io::ErrorKind::NotFound => Ok(None), @@ -990,8 +1015,6 @@ mod tests { } #[tokio::test] - #[cfg(target_os = "linux")] - // macos has some magic in its root '/.VolumeIcon.icns"' which causes this test to fail async fn test_list_root() { let integration = LocalFileSystem::new(); let result = integration.list_with_delimiter(None).await; From 70691b27f145cbec6925f9385a3a64c72e6c6e1b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 29 Jul 2022 13:27:37 -0400 Subject: [PATCH 007/397] Add Builder style config objects for object_store (#2204) * Add AmazonS3Config, MicrosoftAzureBuilder, GoogleCloudStorageBuilder * fix: improve docs * review feedback: remove old code, make with_client test only --- src/aws.rs | 370 ++++++++++++++++++++++++++++++++------------------- src/azure.rs | 220 ++++++++++++++++++------------ src/gcp.rs | 230 ++++++++++++++++++++------------ 3 files changed, 516 insertions(+), 304 deletions(-) diff --git a/src/aws.rs b/src/aws.rs index 3606a38..89a2185 100644 --- a/src/aws.rs +++ b/src/aws.rs @@ -228,6 +228,14 @@ enum Error { source: rusoto_core::region::ParseRegionError, }, + #[snafu(display( + "Region must be specified for AWS S3. 
Regions should look like `us-east-2`" + ))] + MissingRegion {}, + + #[snafu(display("Missing bucket name"))] + MissingBucketName {}, + #[snafu(display("Missing aws-access-key"))] MissingAccessKey, @@ -584,99 +592,195 @@ fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result>, - secret_access_key: Option>, - region: impl Into, - bucket_name: impl Into, - endpoint: Option>, - session_token: Option>, +/// +/// # Example +/// ``` +/// # let REGION = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY_ID = "foo"; +/// # let SECRET_KEY = "foo"; +/// let s3 = object_store::aws::AmazonS3Builder::new() +/// .with_region(REGION) +/// .with_bucket_name(BUCKET_NAME) +/// .with_access_key_id(ACCESS_KEY_ID) +/// .with_secret_access_key(SECRET_KEY) +/// .build(); +/// ``` +#[derive(Debug)] +pub struct AmazonS3Builder { + access_key_id: Option, + secret_access_key: Option, + region: Option, + bucket_name: Option, + endpoint: Option, + token: Option, max_connections: NonZeroUsize, allow_http: bool, -) -> Result { - let region = region.into(); - let region: rusoto_core::Region = match endpoint { - None => region.parse().context(InvalidRegionSnafu { region })?, - Some(endpoint) => rusoto_core::Region::Custom { - name: region, - endpoint: endpoint.into(), - }, - }; +} - let mut builder = HyperBuilder::default(); - builder.pool_max_idle_per_host(max_connections.get()); - - let connector = if allow_http { - hyper_rustls::HttpsConnectorBuilder::new() - .with_webpki_roots() - .https_or_http() - .enable_http1() - .enable_http2() - .build() - } else { - hyper_rustls::HttpsConnectorBuilder::new() - .with_webpki_roots() - .https_only() - .enable_http1() - .enable_http2() - .build() - }; +impl Default for AmazonS3Builder { + fn default() -> Self { + Self { + access_key_id: None, + secret_access_key: None, + region: None, + bucket_name: None, + endpoint: None, + token: None, + max_connections: NonZeroUsize::new(16).unwrap(), + allow_http: false, + } + } +} - let http_client = rusoto_core::request::HttpClient::from_builder(builder, connector); +impl AmazonS3Builder { + /// Create a new [`AmazonS3Builder`] with default values. 
+ pub fn new() -> Self { + Default::default() + } - let client = match (access_key_id, secret_access_key, session_token) { - (Some(access_key_id), Some(secret_access_key), Some(session_token)) => { - let credentials_provider = StaticProvider::new( - access_key_id.into(), - secret_access_key.into(), - Some(session_token.into()), - None, - ); - rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) - } - (Some(access_key_id), Some(secret_access_key), None) => { - let credentials_provider = StaticProvider::new_minimal( - access_key_id.into(), - secret_access_key.into(), - ); - rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) - } - (None, Some(_), _) => return Err(Error::MissingAccessKey.into()), - (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), - _ if std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE").is_some() => { - rusoto_s3::S3Client::new_with( - http_client, - WebIdentityProvider::from_k8s_env(), - region, - ) - } - _ => rusoto_s3::S3Client::new_with( - http_client, - InstanceMetadataProvider::new(), + /// Set the AWS Access Key (required) + pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { + self.access_key_id = Some(access_key_id.into()); + self + } + + /// Set the AWS Secret Access Key (required) + pub fn with_secret_access_key( + mut self, + secret_access_key: impl Into, + ) -> Self { + self.secret_access_key = Some(secret_access_key.into()); + self + } + + /// Set the region (e.g. `us-east-1`) (required) + pub fn with_region(mut self, region: impl Into) -> Self { + self.region = Some(region.into()); + self + } + + /// Set the bucket_name (required) + pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { + self.bucket_name = Some(bucket_name.into()); + self + } + + /// Sets the endpoint for communicating with AWS S3. Default value + /// is based on region. + /// + /// For example, this might be set to `"http://localhost:4566:` + /// for testing against a localstack instance. + pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { + self.endpoint = Some(endpoint.into()); + self + } + + /// Set the token to use for requests (passed to underlying provider) + pub fn with_token(mut self, token: impl Into) -> Self { + self.token = Some(token.into()); + self + } + + /// Sets the maximum number of concurrent outstanding + /// connectons. Default is `16`. + pub fn with_max_connections(mut self, max_connections: NonZeroUsize) -> Self { + self.max_connections = max_connections; + self + } + + /// Sets what protocol is allowed. If `allow_http` is : + /// * false (default): Only HTTPS are allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.allow_http = allow_http; + self + } + + /// Create a [`AmazonS3`] instance from the provided values, + /// consuming `self`. 
+ pub fn build(self) -> Result { + let Self { + access_key_id, + secret_access_key, region, - ), - }; + bucket_name, + endpoint, + token, + max_connections, + allow_http, + } = self; + + let region = region.ok_or(Error::MissingRegion {})?; + let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; + + let region: rusoto_core::Region = match endpoint { + None => region.parse().context(InvalidRegionSnafu { region })?, + Some(endpoint) => rusoto_core::Region::Custom { + name: region, + endpoint, + }, + }; - Ok(AmazonS3 { - client_unrestricted: client, - connection_semaphore: Arc::new(Semaphore::new(max_connections.get())), - bucket_name: bucket_name.into(), - }) -} + let mut builder = HyperBuilder::default(); + builder.pool_max_idle_per_host(max_connections.get()); + + let connector = if allow_http { + hyper_rustls::HttpsConnectorBuilder::new() + .with_webpki_roots() + .https_or_http() + .enable_http1() + .enable_http2() + .build() + } else { + hyper_rustls::HttpsConnectorBuilder::new() + .with_webpki_roots() + .https_only() + .enable_http1() + .enable_http2() + .build() + }; + + let http_client = + rusoto_core::request::HttpClient::from_builder(builder, connector); + + let client = match (access_key_id, secret_access_key, token) { + (Some(access_key_id), Some(secret_access_key), Some(token)) => { + let credentials_provider = StaticProvider::new( + access_key_id, + secret_access_key, + Some(token), + None, + ); + rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) + } + (Some(access_key_id), Some(secret_access_key), None) => { + let credentials_provider = + StaticProvider::new_minimal(access_key_id, secret_access_key); + rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) + } + (None, Some(_), _) => return Err(Error::MissingAccessKey.into()), + (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), + _ if std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE").is_some() => { + rusoto_s3::S3Client::new_with( + http_client, + WebIdentityProvider::from_k8s_env(), + region, + ) + } + _ => rusoto_s3::S3Client::new_with( + http_client, + InstanceMetadataProvider::new(), + region, + ), + }; -/// Create a new [`AmazonS3`] that always errors -pub fn new_failing_s3() -> Result { - new_s3( - Some("foo"), - Some("bar"), - "us-east-1", - "bucket", - None as Option<&str>, - None as Option<&str>, - NonZeroUsize::new(16).unwrap(), - true, - ) + Ok(AmazonS3 { + client_unrestricted: client, + connection_semaphore: Arc::new(Semaphore::new(max_connections.get())), + bucket_name, + }) + } } /// S3 client bundled w/ a semaphore permit. @@ -1057,7 +1161,7 @@ mod tests { get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, }, - Error as ObjectStoreError, ObjectStore, + Error as ObjectStoreError, }; use bytes::Bytes; use std::env; @@ -1067,17 +1171,9 @@ mod tests { const NON_EXISTENT_NAME: &str = "nonexistentname"; - #[derive(Debug)] - struct AwsConfig { - access_key_id: String, - secret_access_key: String, - region: String, - bucket: String, - endpoint: Option, - token: Option, - } - - // Helper macro to skip tests if TEST_INTEGRATION and the AWS environment variables are not set. + // Helper macro to skip tests if TEST_INTEGRATION and the AWS + // environment variables are not set. Returns a configured + // AmazonS3Builder macro_rules! 
maybe_skip_integration { () => {{ dotenv::dotenv().ok(); @@ -1116,18 +1212,38 @@ mod tests { ); return; } else { - AwsConfig { - access_key_id: env::var("AWS_ACCESS_KEY_ID") - .expect("already checked AWS_ACCESS_KEY_ID"), - secret_access_key: env::var("AWS_SECRET_ACCESS_KEY") - .expect("already checked AWS_SECRET_ACCESS_KEY"), - region: env::var("AWS_DEFAULT_REGION") - .expect("already checked AWS_DEFAULT_REGION"), - bucket: env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - endpoint: env::var("AWS_ENDPOINT").ok(), - token: env::var("AWS_SESSION_TOKEN").ok(), - } + let config = AmazonS3Builder::new() + .with_access_key_id( + env::var("AWS_ACCESS_KEY_ID") + .expect("already checked AWS_ACCESS_KEY_ID"), + ) + .with_secret_access_key( + env::var("AWS_SECRET_ACCESS_KEY") + .expect("already checked AWS_SECRET_ACCESS_KEY"), + ) + .with_region( + env::var("AWS_DEFAULT_REGION") + .expect("already checked AWS_DEFAULT_REGION"), + ) + .with_bucket_name( + env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + ) + .with_allow_http(true); + + let config = if let Some(endpoint) = env::var("AWS_ENDPOINT").ok() { + config.with_endpoint(endpoint) + } else { + config + }; + + let config = if let Some(token) = env::var("AWS_SESSION_TOKEN").ok() { + config.with_token(token) + } else { + config + }; + + config } }}; } @@ -1148,24 +1264,10 @@ mod tests { r } - fn make_integration(config: AwsConfig) -> AmazonS3 { - new_s3( - Some(config.access_key_id), - Some(config.secret_access_key), - config.region, - config.bucket, - config.endpoint, - config.token, - NonZeroUsize::new(16).unwrap(), - true, - ) - .expect("Valid S3 config") - } - #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); - let integration = make_integration(config); + let integration = config.build().unwrap(); check_credentials(put_get_delete_list(&integration).await).unwrap(); check_credentials(list_uses_directories_correctly(&integration).await).unwrap(); @@ -1177,7 +1279,7 @@ mod tests { #[tokio::test] async fn s3_test_get_nonexistent_location() { let config = maybe_skip_integration!(); - let integration = make_integration(config); + let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1204,9 +1306,8 @@ mod tests { #[tokio::test] async fn s3_test_get_nonexistent_bucket() { - let mut config = maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = make_integration(config); + let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1220,9 +1321,9 @@ mod tests { #[tokio::test] async fn s3_test_put_nonexistent_bucket() { - let mut config = maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = make_integration(config); + let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + + let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); let data = Bytes::from("arbitrary data"); @@ -1244,7 +1345,7 @@ mod tests { #[tokio::test] async fn s3_test_delete_nonexistent_location() { let config = maybe_skip_integration!(); - let integration = make_integration(config); + let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1253,9 +1354,8 @@ mod tests { #[tokio::test] async fn s3_test_delete_nonexistent_bucket() { - let mut config = 
maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = make_integration(config); + let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); diff --git a/src/azure.rs b/src/azure.rs index 25f311a..dca52a3 100644 --- a/src/azure.rs +++ b/src/azure.rs @@ -185,6 +185,15 @@ enum Error { env_value: String, source: url::ParseError, }, + + #[snafu(display("Account must be specified"))] + MissingAccount {}, + + #[snafu(display("Access key must be specified"))] + MissingAccessKey {}, + + #[snafu(display("Container name must be specified"))] + MissingContainerName {}, } impl From for super::Error { @@ -570,73 +579,125 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { Ok(url) } -/// Configure a connection to container with given name on Microsoft Azure -/// Blob store. +/// Configure a connection to Mirosoft Azure Blob Storage bucket using +/// the specified credentials. /// -/// The credentials `account` and `access_key` must provide access to the -/// store. -pub fn new_azure( - account: impl Into, - access_key: impl Into, - container_name: impl Into, +/// # Example +/// ``` +/// # let ACCOUNT = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY = "foo"; +/// let azure = object_store::azure::MicrosoftAzureBuilder::new() +/// .with_account(ACCOUNT) +/// .with_access_key(ACCESS_KEY) +/// .with_container_name(BUCKET_NAME) +/// .build(); +/// ``` +#[derive(Debug, Default)] +pub struct MicrosoftAzureBuilder { + account: Option, + access_key: Option, + container_name: Option, use_emulator: bool, -) -> Result { - let account = account.into(); - let access_key = access_key.into(); - let http_client: Arc = Arc::new(reqwest::Client::new()); - - let (is_emulator, storage_account_client) = if use_emulator { - check_if_emulator_works()?; - // Allow overriding defaults. Values taken from - // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 - let http_client = azure_core::new_http_client(); - let blob_storage_url = - url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let queue_storage_url = - url_from_env("AZURITE_QUEUE_STORAGE_URL", "http://127.0.0.1:10001")?; - let table_storage_url = - url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10002")?; - let filesystem_url = - url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10004")?; - - let storage_client = StorageAccountClient::new_emulator( - http_client, - &blob_storage_url, - &table_storage_url, - &queue_storage_url, - &filesystem_url, - ); - - (true, storage_client) - } else { - ( - false, - StorageAccountClient::new_access_key( - Arc::clone(&http_client), - &account, - &access_key, - ), - ) - }; +} - let storage_client = storage_account_client.as_storage_client(); - let blob_base_url = storage_account_client - .blob_storage_url() - .as_ref() - // make url ending consistent between the emulator and remote storage account - .trim_end_matches('/') - .to_string(); +impl MicrosoftAzureBuilder { + /// Create a new [`MicrosoftAzureBuilder`] with default values. 
+ pub fn new() -> Self { + Default::default() + } - let container_name = container_name.into(); + /// Set the Azure Account (required) + pub fn with_account(mut self, account: impl Into) -> Self { + self.account = Some(account.into()); + self + } - let container_client = storage_client.as_container_client(&container_name); + /// Set the Azure Access Key (required) + pub fn with_access_key(mut self, access_key: impl Into) -> Self { + self.access_key = Some(access_key.into()); + self + } + + /// Set the Azure Container Name (required) + pub fn with_container_name(mut self, container_name: impl Into) -> Self { + self.container_name = Some(container_name.into()); + self + } - Ok(MicrosoftAzure { - container_client, - container_name, - blob_base_url, - is_emulator, - }) + /// Set if the Azure emulator should be used (defaults to false) + pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { + self.use_emulator = use_emulator; + self + } + + /// Configure a connection to container with given name on Microsoft Azure + /// Blob store. + pub fn build(self) -> Result { + let Self { + account, + access_key, + container_name, + use_emulator, + } = self; + + let account = account.ok_or(Error::MissingAccount {})?; + let access_key = access_key.ok_or(Error::MissingAccessKey {})?; + let container_name = container_name.ok_or(Error::MissingContainerName {})?; + + let http_client: Arc = Arc::new(reqwest::Client::new()); + + let (is_emulator, storage_account_client) = if use_emulator { + check_if_emulator_works()?; + // Allow overriding defaults. Values taken from + // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 + let http_client = azure_core::new_http_client(); + let blob_storage_url = + url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; + let queue_storage_url = + url_from_env("AZURITE_QUEUE_STORAGE_URL", "http://127.0.0.1:10001")?; + let table_storage_url = + url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10002")?; + let filesystem_url = + url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10004")?; + + let storage_client = StorageAccountClient::new_emulator( + http_client, + &blob_storage_url, + &table_storage_url, + &queue_storage_url, + &filesystem_url, + ); + + (true, storage_client) + } else { + ( + false, + StorageAccountClient::new_access_key( + Arc::clone(&http_client), + &account, + &access_key, + ), + ) + }; + + let storage_client = storage_account_client.as_storage_client(); + let blob_base_url = storage_account_client + .blob_storage_url() + .as_ref() + // make url ending consistent between the emulator and remote storage account + .trim_end_matches('/') + .to_string(); + + let container_client = storage_client.as_container_client(&container_name); + + Ok(MicrosoftAzure { + container_client, + container_name, + blob_base_url, + is_emulator, + }) + } } // Relevant docs: https://azure.github.io/Storage/docs/application-and-user-data/basics/azure-blob-storage-upload-apis/ @@ -729,21 +790,13 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { #[cfg(test)] mod tests { - use crate::azure::new_azure; + use super::*; use crate::tests::{ copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, put_get_delete_list, rename_and_copy, }; use std::env; - #[derive(Debug)] - struct AzureConfig { - storage_account: String, - access_key: String, - bucket: String, - use_emulator: bool, - } - // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment // 
variables are not set. macro_rules! maybe_skip_integration { @@ -785,28 +838,23 @@ mod tests { ); return; } else { - AzureConfig { - storage_account: env::var("AZURE_STORAGE_ACCOUNT") - .unwrap_or_default(), - access_key: env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), - bucket: env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - use_emulator, - } + MicrosoftAzureBuilder::new() + .with_account(env::var("AZURE_STORAGE_ACCOUNT").unwrap_or_default()) + .with_access_key( + env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), + ) + .with_container_name( + env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + ) + .with_use_emulator(use_emulator) } }}; } #[tokio::test] async fn azure_blob_test() { - let config = maybe_skip_integration!(); - let integration = new_azure( - config.storage_account, - config.access_key, - config.bucket, - config.use_emulator, - ) - .unwrap(); + let integration = maybe_skip_integration!().build().unwrap(); put_get_delete_list(&integration).await.unwrap(); list_uses_directories_correctly(&integration).await.unwrap(); diff --git a/src/gcp.rs b/src/gcp.rs index d740625..dea8769 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -98,6 +98,12 @@ enum Error { #[snafu(display("Error decoding object size: {}", source))] InvalidSize { source: std::num::ParseIntError }, + + #[snafu(display("Missing bucket name"))] + MissingBucketName {}, + + #[snafu(display("Missing service account path"))] + MissingServiceAccountPath, } impl From for super::Error { @@ -779,55 +785,121 @@ fn reader_credentials_file( Ok(serde_json::from_reader(reader).context(DecodeCredentialsSnafu)?) } -/// Configure a connection to Google Cloud Storage. -pub fn new_gcs( - service_account_path: impl AsRef, - bucket_name: impl Into, -) -> Result { - new_gcs_with_client(service_account_path, bucket_name, Client::new()) +/// Configure a connection to Google Cloud Storage using the specified +/// credentials. +/// +/// # Example +/// ``` +/// # let BUCKET_NAME = "foo"; +/// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json"; +/// let gcs = object_store::gcp::GoogleCloudStorageBuilder::new() +/// .with_service_account_path(SERVICE_ACCOUNT_PATH) +/// .with_bucket_name(BUCKET_NAME) +/// .build(); +/// ``` +#[derive(Debug, Default)] +pub struct GoogleCloudStorageBuilder { + bucket_name: Option, + service_account_path: Option, + client: Option, } -/// Configure a connection to Google Cloud Storage with the specified HTTP client. -pub fn new_gcs_with_client( - service_account_path: impl AsRef, - bucket_name: impl Into, - client: Client, -) -> Result { - let credentials = reader_credentials_file(service_account_path)?; - - // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes - let scope = "https://www.googleapis.com/auth/devstorage.full_control"; - let audience = "https://www.googleapis.com/oauth2/v4/token".to_string(); - - let oauth_provider = (!credentials.disable_oauth) - .then(|| { - OAuthProvider::new( - credentials.client_email, - credentials.private_key, - scope.to_string(), - audience, - ) - }) - .transpose()?; +impl GoogleCloudStorageBuilder { + /// Create a new [`GoogleCloudStorageBuilder`] with default values. 
+ pub fn new() -> Self { + Default::default() + } - let bucket_name = bucket_name.into(); - let encoded_bucket_name = - percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); + /// Set the bucket name (required) + pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { + self.bucket_name = Some(bucket_name.into()); + self + } - // The cloud storage crate currently only supports authentication via - // environment variables. Set the environment variable explicitly so - // that we can optionally accept command line arguments instead. - Ok(GoogleCloudStorage { - client: Arc::new(GoogleCloudStorageClient { - client, - base_url: credentials.gcs_base_url, - oauth_provider, - token_cache: Default::default(), + /// Set the path to the service account file (required). Example + /// `"/tmp/gcs.json"` + /// + /// Example contents of `gcs.json`: + /// + /// ```json + /// { + /// "gcs_base_url": "https://localhost:4443", + /// "disable_oauth": true, + /// "client_email": "", + /// "private_key": "" + /// } + /// ``` + pub fn with_service_account_path( + mut self, + service_account_path: impl Into, + ) -> Self { + self.service_account_path = Some(service_account_path.into()); + self + } + + /// Use the specified http [`Client`] (defaults to [`Client::new`]) + /// + /// This allows you to set custom client options such as allowing + /// non secure connections or custom headers. + /// + /// NOTE: Currently only available in `test`s to facilitate + /// testing, to avoid leaking details and preserve our ability to + /// make changes to the implementation. + #[cfg(test)] + pub fn with_client(mut self, client: Client) -> Self { + self.client = Some(client); + self + } + + /// Configure a connection to Google Cloud Storage, returning a + /// new [`GoogleCloudStorage`] and consuming `self` + pub fn build(self) -> Result { + let Self { bucket_name, - bucket_name_encoded: encoded_bucket_name, - max_list_results: None, - }), - }) + service_account_path, + client, + } = self; + + let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; + let service_account_path = + service_account_path.ok_or(Error::MissingServiceAccountPath)?; + let client = client.unwrap_or_else(Client::new); + + let credentials = reader_credentials_file(service_account_path)?; + + // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes + let scope = "https://www.googleapis.com/auth/devstorage.full_control"; + let audience = "https://www.googleapis.com/oauth2/v4/token".to_string(); + + let oauth_provider = (!credentials.disable_oauth) + .then(|| { + OAuthProvider::new( + credentials.client_email, + credentials.private_key, + scope.to_string(), + audience, + ) + }) + .transpose()?; + + let encoded_bucket_name = + percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); + + // The cloud storage crate currently only supports authentication via + // environment variables. Set the environment variable explicitly so + // that we can optionally accept command line arguments instead. 
+ Ok(GoogleCloudStorage { + client: Arc::new(GoogleCloudStorageClient { + client, + base_url: credentials.gcs_base_url, + oauth_provider, + token_cache: Default::default(), + bucket_name, + bucket_name_encoded: encoded_bucket_name, + max_list_results: None, + }), + }) + } } fn convert_object_meta(object: &Object) -> Result { @@ -860,24 +932,6 @@ mod test { const NON_EXISTENT_NAME: &str = "nonexistentname"; - #[derive(Debug)] - struct GoogleCloudConfig { - bucket: String, - service_account: String, - } - - impl GoogleCloudConfig { - fn build_test(self) -> Result { - // ignore HTTPS errors in tests so we can use fake-gcs server - let client = Client::builder() - .danger_accept_invalid_certs(true) - .build() - .expect("Error creating http client for testing"); - - new_gcs_with_client(self.service_account, self.bucket, client) - } - } - // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set. macro_rules! maybe_skip_integration { () => {{ @@ -912,20 +966,29 @@ mod test { ); return; } else { - GoogleCloudConfig { - bucket: env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - service_account: env::var("GOOGLE_SERVICE_ACCOUNT") - .expect("already checked GOOGLE_SERVICE_ACCOUNT"), - } + GoogleCloudStorageBuilder::new() + .with_bucket_name( + env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET") + ) + .with_service_account_path( + env::var("GOOGLE_SERVICE_ACCOUNT") + .expect("already checked GOOGLE_SERVICE_ACCOUNT") + ) + .with_client( + // ignore HTTPS errors in tests so we can use fake-gcs server + Client::builder() + .danger_accept_invalid_certs(true) + .build() + .expect("Error creating http client for testing") + ) } }}; } #[tokio::test] async fn gcs_test() { - let config = maybe_skip_integration!(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!().build().unwrap(); put_get_delete_list(&integration).await.unwrap(); list_uses_directories_correctly(&integration).await.unwrap(); @@ -940,8 +1003,7 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -956,9 +1018,10 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_bucket() { - let mut config = maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!() + .with_bucket_name(NON_EXISTENT_NAME) + .build() + .unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -975,8 +1038,7 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -990,9 +1052,10 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_bucket() { - let mut config = maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!() + .with_bucket_name(NON_EXISTENT_NAME) + .build() + .unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1006,9 +1069,10 @@ mod test { #[tokio::test] async fn gcs_test_put_nonexistent_bucket() { - let 
mut config = maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!() + .with_bucket_name(NON_EXISTENT_NAME) + .build() + .unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); let data = Bytes::from("arbitrary data"); From 5b67b543a424e694b385d6b6656ed10a925cf044 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 31 Jul 2022 21:44:32 +0100 Subject: [PATCH 008/397] Add LimitStore (#2175) (#2242) * Add LimitStore (#2175) * Review feedback * Fix test --- src/aws.rs | 1 + src/lib.rs | 1 + src/limit.rs | 263 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 265 insertions(+) create mode 100644 src/limit.rs diff --git a/src/aws.rs b/src/aws.rs index 89a2185..cedd465 100644 --- a/src/aws.rs +++ b/src/aws.rs @@ -684,6 +684,7 @@ impl AmazonS3Builder { /// Sets the maximum number of concurrent outstanding /// connectons. Default is `16`. + #[deprecated(note = "use LimitStore instead")] pub fn with_max_connections(mut self, max_connections: NonZeroUsize) -> Self { self.max_connections = max_connections; self diff --git a/src/lib.rs b/src/lib.rs index 54d2827..33e8452 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,6 +45,7 @@ pub mod aws; pub mod azure; #[cfg(feature = "gcp")] pub mod gcp; +pub mod limit; pub mod local; pub mod memory; pub mod path; diff --git a/src/limit.rs b/src/limit.rs new file mode 100644 index 0000000..fd21ccb --- /dev/null +++ b/src/limit.rs @@ -0,0 +1,263 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store that limits the maximum concurrency of the wrapped implementation + +use crate::{ + BoxStream, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, + StreamExt, +}; +use async_trait::async_trait; +use bytes::Bytes; +use futures::Stream; +use std::io::{Error, IoSlice}; +use std::ops::Range; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tokio::io::AsyncWrite; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; + +/// Store wrapper that wraps an inner store and limits the maximum number of concurrent +/// object store operations. 
Where each call to an [`ObjectStore`] member function is +/// considered a single operation, even if it may result in more than one network call +/// +/// ``` +/// # use object_store::memory::InMemory; +/// # use object_store::limit::LimitStore; +/// +/// // Create an in-memory `ObjectStore` limited to 20 concurrent requests +/// let store = LimitStore::new(InMemory::new(), 20); +/// ``` +/// +#[derive(Debug)] +pub struct LimitStore { + inner: T, + max_requests: usize, + semaphore: Arc, +} + +impl LimitStore { + /// Create new limit store that will limit the maximum + /// number of outstanding concurrent requests to + /// `max_requests` + pub fn new(inner: T, max_requests: usize) -> Self { + Self { + inner, + max_requests, + semaphore: Arc::new(Semaphore::new(max_requests)), + } + } +} + +impl std::fmt::Display for LimitStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "LimitStore({}, {})", self.max_requests, self.inner) + } +} + +#[async_trait] +impl ObjectStore for LimitStore { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.put(location, bytes).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + let (id, write) = self.inner.put_multipart(location).await?; + Ok((id, Box::new(PermitWrapper::new(write, permit)))) + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.abort_multipart(location, multipart_id).await + } + + async fn get(&self, location: &Path) -> Result { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + match self.inner.get(location).await? 
{ + r @ GetResult::File(_, _) => Ok(r), + GetResult::Stream(s) => { + Ok(GetResult::Stream(PermitWrapper::new(s, permit).boxed())) + } + } + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.get_range(location, range).await + } + + async fn head(&self, location: &Path) -> Result { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.head(location).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + let s = self.inner.list(prefix).await?; + Ok(PermitWrapper::new(s, permit).boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.copy(from, to).await + } + + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.rename(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.copy_if_not_exists(from, to).await + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.rename_if_not_exists(from, to).await + } +} + +/// Combines an [`OwnedSemaphorePermit`] with some other type +struct PermitWrapper { + inner: T, + #[allow(dead_code)] + permit: OwnedSemaphorePermit, +} + +impl PermitWrapper { + fn new(inner: T, permit: OwnedSemaphorePermit) -> Self { + Self { inner, permit } + } +} + +impl Stream for PermitWrapper { + type Item = T::Item; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.inner).poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl AsyncWrite for PermitWrapper { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + Pin::new(&mut self.inner).poll_write(cx, buf) + } + + fn poll_flush( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.inner).poll_shutdown(cx) + } + + fn poll_write_vectored( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + bufs: &[IoSlice<'_>], + ) -> Poll> { + Pin::new(&mut self.inner).poll_write_vectored(cx, bufs) + } + + fn is_write_vectored(&self) -> bool { + self.inner.is_write_vectored() + } +} + +#[cfg(test)] +mod tests { + use crate::limit::LimitStore; + use crate::memory::InMemory; + use crate::tests::{ + list_uses_directories_correctly, list_with_delimiter, put_get_delete_list, + rename_and_copy, stream_get, + }; + use crate::ObjectStore; + use std::time::Duration; + use tokio::time::timeout; + + #[tokio::test] + async fn limit_test() { + let max_requests = 10; + let memory = InMemory::new(); + let integration = LimitStore::new(memory, max_requests); + + put_get_delete_list(&integration).await.unwrap(); + 
list_uses_directories_correctly(&integration).await.unwrap(); + list_with_delimiter(&integration).await.unwrap(); + rename_and_copy(&integration).await.unwrap(); + stream_get(&integration).await.unwrap(); + + let mut streams = Vec::with_capacity(max_requests); + for _ in 0..max_requests { + let stream = integration.list(None).await.unwrap(); + streams.push(stream); + } + + let t = Duration::from_millis(20); + + // Expect to not be able to make another request + assert!(timeout(t, integration.list(None)).await.is_err()); + + // Drop one of the streams + streams.pop(); + + // Can now make another request + integration.list(None).await.unwrap(); + } +} From 856175fd06414e4a75f44c06fb4b46f9639c4f96 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:01:07 +0100 Subject: [PATCH 009/397] Handle symlinks in LocalFileSystem (#2206) (#2269) * Handle symlinks in LocalFileSystem (#2206) * Update object_store/src/local.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- src/local.rs | 169 +++++++++++++++++++++++++++++++++++++++++++----- src/path/mod.rs | 22 ++----- 2 files changed, 160 insertions(+), 31 deletions(-) diff --git a/src/local.rs b/src/local.rs index e2f133e..c3f54e0 100644 --- a/src/local.rs +++ b/src/local.rs @@ -68,56 +68,56 @@ pub(crate) enum Error { #[snafu(display("Unable to create dir {}: {}", path.display(), source))] UnableToCreateDir { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Unable to create file {}: {}", path.display(), err))] UnableToCreateFile { - path: std::path::PathBuf, + path: PathBuf, err: io::Error, }, #[snafu(display("Unable to delete file {}: {}", path.display(), source))] UnableToDeleteFile { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Unable to open file {}: {}", path.display(), source))] UnableToOpenFile { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Unable to read data from file {}: {}", path.display(), source))] UnableToReadBytes { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual))] OutOfRange { - path: std::path::PathBuf, + path: PathBuf, expected: usize, actual: usize, }, #[snafu(display("Unable to copy file from {} to {}: {}", from.display(), to.display(), source))] UnableToCopyFile { - from: std::path::PathBuf, - to: std::path::PathBuf, + from: PathBuf, + to: PathBuf, source: io::Error, }, NotFound { - path: std::path::PathBuf, + path: PathBuf, source: io::Error, }, #[snafu(display("Error seeking file {}: {}", path.display(), source))] Seek { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] @@ -170,6 +170,17 @@ impl From for super::Error { /// /// If not called from a tokio context, this will perform IO on the current thread with /// no additional complexity or overheads +/// +/// # Symlinks +/// +/// [`LocalFileSystem`] will follow symlinks as normal, however, it is worth noting: +/// +/// * Broken symlinks will be silently ignored by listing operations +/// * No effort is made to prevent breaking symlinks when deleting files +/// * Symlinks that resolve to paths outside the root **will** be followed +/// * Mutating a file through one or more symlinks will mutate the underlying file +/// * Deleting a path that resolves to a symlink will only delete 
the symlink +/// #[derive(Debug)] pub struct LocalFileSystem { config: Arc, @@ -214,10 +225,13 @@ impl LocalFileSystem { impl Config { /// Return filesystem path of the given location - fn path_to_filesystem(&self, location: &Path) -> Result { + fn path_to_filesystem(&self, location: &Path) -> Result { let mut url = self.root.clone(); url.path_segments_mut() .expect("url path") + // technically not necessary as Path ignores empty segments + // but avoids creating paths with "//" which look odd in error messages. + .pop_if_empty() .extend(location.parts()); url.to_file_path() @@ -371,7 +385,8 @@ impl ObjectStore for LocalFileSystem { let walkdir = WalkDir::new(&root_path) // Don't include the root directory itself - .min_depth(1); + .min_depth(1) + .follow_links(true); let s = walkdir.into_iter().flat_map(move |result_dir_entry| { match convert_walkdir_result(result_dir_entry) { @@ -433,7 +448,10 @@ impl ObjectStore for LocalFileSystem { let resolved_prefix = config.path_to_filesystem(&prefix)?; maybe_spawn_blocking(move || { - let walkdir = WalkDir::new(&resolved_prefix).min_depth(1).max_depth(1); + let walkdir = WalkDir::new(&resolved_prefix) + .min_depth(1) + .max_depth(1) + .follow_links(true); let mut common_prefixes = BTreeSet::new(); let mut objects = Vec::new(); @@ -732,7 +750,7 @@ impl AsyncWrite for LocalUpload { } } -fn open_file(path: &std::path::PathBuf) -> Result { +fn open_file(path: &PathBuf) -> Result { let file = File::open(path).map_err(|e| { if e.kind() == std::io::ErrorKind::NotFound { Error::NotFound { @@ -749,7 +767,7 @@ fn open_file(path: &std::path::PathBuf) -> Result { Ok(file) } -fn open_writable_file(path: &std::path::PathBuf) -> Result { +fn open_writable_file(path: &PathBuf) -> Result { match File::create(&path) { Ok(f) => Ok(f), Err(err) if err.kind() == std::io::ErrorKind::NotFound => { @@ -861,7 +879,8 @@ mod tests { }, Error as ObjectStoreError, ObjectStore, }; - use tempfile::TempDir; + use futures::TryStreamExt; + use tempfile::{NamedTempFile, TempDir}; use tokio::io::AsyncWriteExt; #[tokio::test] @@ -1030,6 +1049,124 @@ mod tests { } } + async fn check_list( + integration: &LocalFileSystem, + prefix: Option<&Path>, + expected: &[&str], + ) { + let result: Vec<_> = integration + .list(prefix) + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + let mut strings: Vec<_> = result.iter().map(|x| x.location.as_ref()).collect(); + strings.sort_unstable(); + assert_eq!(&strings, expected) + } + + #[tokio::test] + #[cfg(target_family = "unix")] + async fn test_symlink() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let subdir = root.path().join("a"); + std::fs::create_dir(&subdir).unwrap(); + let file = subdir.join("file.parquet"); + std::fs::write(file, "test").unwrap(); + + check_list(&integration, None, &["a/file.parquet"]).await; + integration + .head(&Path::from("a/file.parquet")) + .await + .unwrap(); + + // Follow out of tree symlink + let other = NamedTempFile::new().unwrap(); + std::os::unix::fs::symlink(other.path(), root.path().join("test.parquet")) + .unwrap(); + + // Should return test.parquet even though out of tree + check_list(&integration, None, &["a/file.parquet", "test.parquet"]).await; + + // Can fetch test.parquet + integration.head(&Path::from("test.parquet")).await.unwrap(); + + // Follow in tree symlink + std::os::unix::fs::symlink(&subdir, root.path().join("b")).unwrap(); + check_list( + &integration, + None, + &["a/file.parquet", "b/file.parquet", 
"test.parquet"], + ) + .await; + check_list(&integration, Some(&Path::from("b")), &["b/file.parquet"]).await; + + // Can fetch through symlink + integration + .head(&Path::from("b/file.parquet")) + .await + .unwrap(); + + // Ignore broken symlink + std::os::unix::fs::symlink( + root.path().join("foo.parquet"), + root.path().join("c"), + ) + .unwrap(); + + check_list( + &integration, + None, + &["a/file.parquet", "b/file.parquet", "test.parquet"], + ) + .await; + + let mut r = integration.list_with_delimiter(None).await.unwrap(); + r.common_prefixes.sort_unstable(); + assert_eq!(r.common_prefixes.len(), 2); + assert_eq!(r.common_prefixes[0].as_ref(), "a"); + assert_eq!(r.common_prefixes[1].as_ref(), "b"); + assert_eq!(r.objects.len(), 1); + assert_eq!(r.objects[0].location.as_ref(), "test.parquet"); + + let r = integration + .list_with_delimiter(Some(&Path::from("a"))) + .await + .unwrap(); + assert_eq!(r.common_prefixes.len(), 0); + assert_eq!(r.objects.len(), 1); + assert_eq!(r.objects[0].location.as_ref(), "a/file.parquet"); + + // Deleting a symlink doesn't delete the source file + integration + .delete(&Path::from("test.parquet")) + .await + .unwrap(); + assert!(other.path().exists()); + + check_list(&integration, None, &["a/file.parquet", "b/file.parquet"]).await; + + // Deleting through a symlink deletes both files + integration + .delete(&Path::from("b/file.parquet")) + .await + .unwrap(); + + check_list(&integration, None, &[]).await; + + // Adding a file through a symlink creates in both paths + integration + .put(&Path::from("b/file.parquet"), Bytes::from(vec![0, 1, 2])) + .await + .unwrap(); + + check_list(&integration, None, &["a/file.parquet", "b/file.parquet"]).await; + } + #[tokio::test] async fn invalid_path() { let root = TempDir::new().unwrap(); diff --git a/src/path/mod.rs b/src/path/mod.rs index 23488ef..38b7eb3 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -163,7 +163,7 @@ impl Path { /// Convert a filesystem path to a [`Path`] relative to the filesystem root /// - /// This will return an error if the path does not exist, or contains illegal + /// This will return an error if the path contains illegal /// character sequences as defined by [`Path::parse`] pub fn from_filesystem_path( path: impl AsRef, @@ -173,9 +173,8 @@ impl Path { /// Convert a filesystem path to a [`Path`] relative to the provided base /// - /// This will return an error if the path does not exist on the local filesystem, - /// contains illegal character sequences as defined by [`Path::parse`], or `base` - /// does not refer to a parent path of `path` + /// This will return an error if the path contains illegal character sequences + /// as defined by [`Path::parse`], or `base` does not refer to a parent path of `path` pub(crate) fn from_filesystem_path_with_base( path: impl AsRef, base: Option<&Url>, @@ -295,20 +294,13 @@ where } } -/// Given a filesystem path, convert it to its canonical URL representation, -/// returning an error if the file doesn't exist on the local filesystem +/// Given a filesystem path convert it to a URL representation pub(crate) fn filesystem_path_to_url( path: impl AsRef, ) -> Result { - let path = path.as_ref().canonicalize().context(CanonicalizeSnafu { - path: path.as_ref(), - })?; - - match path.is_dir() { - true => Url::from_directory_path(&path), - false => Url::from_file_path(&path), - } - .map_err(|_| Error::InvalidPath { path }) + Url::from_file_path(&path).map_err(|_| Error::InvalidPath { + path: path.as_ref().into(), + }) } #[cfg(test)] From 
47db1544426607108848b69fa7ea8a12bb0d8ca4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 2 Aug 2022 15:43:33 -0400 Subject: [PATCH 010/397] Improve `object_store crate` documentation (#2260) * Improve crates.io page * Improve builder doc examples * Add examples in main library docs * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- Cargo.toml | 4 +- README.md | 17 ++++++- src/aws.rs | 5 +- src/azure.rs | 5 +- src/gcp.rs | 5 +- src/lib.rs | 128 ++++++++++++++++++++++++++++++++++++++++++++++++--- 6 files changed, 147 insertions(+), 17 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7415398..b5b1ae1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ version = "0.3.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" -description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage and Azure Blob Storage" +description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = [ "object", "storage", @@ -77,4 +77,4 @@ aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", " [dev-dependencies] # In alphabetical order dotenv = "0.15.0" tempfile = "3.1.0" -futures-test = "0.3" +futures-test = "0.3" \ No newline at end of file diff --git a/README.md b/README.md index 313588b..fd10414 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,21 @@ # Rust Object Store -A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage. +A focused, easy to use, idiomatic, high performance, `async` object +store library interacting with object stores. -Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to Apache Arrow. +Using this crate, the same binary and code can easily run in multiple +clouds and local test environments, via a simple runtime configuration +change. Supported object stores include: + +* [AWS S3](https://aws.amazon.com/s3/) +* [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) +* [Google Cloud Storage](https://cloud.google.com/storage) +* Local files +* Memory +* Custom implementations + + +Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to [Apache Arrow](https://arrow.apache.org/). See [docs.rs](https://docs.rs/object_store) for usage instructions diff --git a/src/aws.rs b/src/aws.rs index cedd465..d59f48b 100644 --- a/src/aws.rs +++ b/src/aws.rs @@ -260,7 +260,7 @@ impl From for super::Error { } } -/// Configuration for connecting to [Amazon S3](https://aws.amazon.com/s3/). +/// Interface for [Amazon S3](https://aws.amazon.com/s3/). pub struct AmazonS3 { /// S3 client w/o any connection limit. /// @@ -599,7 +599,8 @@ fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result for super::Error { } } -/// Configuration for connecting to [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). +/// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). 
#[derive(Debug)] pub struct MicrosoftAzure { container_client: Arc, @@ -587,7 +587,8 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { /// # let ACCOUNT = "foo"; /// # let BUCKET_NAME = "foo"; /// # let ACCESS_KEY = "foo"; -/// let azure = object_store::azure::MicrosoftAzureBuilder::new() +/// # use object_store::azure::MicrosoftAzureBuilder; +/// let azure = MicrosoftAzureBuilder::new() /// .with_account(ACCOUNT) /// .with_access_key(ACCESS_KEY) /// .with_container_name(BUCKET_NAME) diff --git a/src/gcp.rs b/src/gcp.rs index dea8769..dd9c844 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -192,7 +192,7 @@ struct CompleteMultipartUpload { parts: Vec, } -/// Configuration for connecting to [Google Cloud Storage](https://cloud.google.com/storage/). +/// Interface for [Google Cloud Storage](https://cloud.google.com/storage/). #[derive(Debug)] pub struct GoogleCloudStorage { client: Arc, @@ -792,7 +792,8 @@ fn reader_credentials_file( /// ``` /// # let BUCKET_NAME = "foo"; /// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json"; -/// let gcs = object_store::gcp::GoogleCloudStorageBuilder::new() +/// # use object_store::gcp::GoogleCloudStorageBuilder; +/// let gcs = GoogleCloudStorageBuilder::new() /// .with_service_account_path(SERVICE_ACCOUNT_PATH) /// .with_bucket_name(BUCKET_NAME) /// .build(); diff --git a/src/lib.rs b/src/lib.rs index 33e8452..c1d7e3e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,15 +28,129 @@ //! # object_store //! -//! This crate provides APIs for interacting with object storage services. +//! This crate provides a uniform API for interacting with object storage services and +//! local files via the the [`ObjectStore`] trait. //! -//! It currently supports PUT (single or chunked/concurrent), GET, DELETE, HEAD and list for: +//! # Create an [`ObjectStore`] implementation: //! -//! * [Google Cloud Storage](https://cloud.google.com/storage/) -//! * [Amazon S3](https://aws.amazon.com/s3/) -//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/#overview) -//! * In-memory -//! * Local file storage +//! * [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder) +//! * [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder) +//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/):: [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder) +//! * In Memory: [`InMemory`](memory::InMemory) +//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) +//! +//! # Adapters +//! +//! [`ObjectStore`] instances can be composed with various adapters +//! which add additional functionality: +//! +//! * Rate Throttling: [`ThrottleConfig`](throttle::ThrottleConfig) +//! * Concurrent Request Limit: [`LimitStore`](limit::LimitStore) +//! +//! +//! # Listing objects: +//! +//! Use the [`ObjectStore::list`] method to iterate over objects in +//! remote storage or files in the local filesystem: +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # // use LocalFileSystem for example +//! # fn get_object_store() -> LocalFileSystem { +//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # } +//! +//! # async fn example() { +//! use std::sync::Arc; +//! use object_store::{path::Path, ObjectStore}; +//! use futures::stream::StreamExt; +//! +//! // create an ObjectStore +//! let object_store: Arc = Arc::new(get_object_store()); +//! +//! // Recursively list all files below the 'data' path. +//! // 1. 
On AWS S3 this would be the 'data/' prefix +//! // 2. On a local filesystem, this would be the 'data' directory +//! let prefix: Path = "data".try_into().unwrap(); +//! +//! // Get an `async` stream of Metadata objects: +//! let list_stream = object_store +//! .list(Some(&prefix)) +//! .await +//! .expect("Error listing files"); +//! +//! // Print a line about each object based on its metadata +//! // using for_each from `StreamExt` trait. +//! list_stream +//! .for_each(move |meta| { +//! async { +//! let meta = meta.expect("Error listing"); +//! println!("Name: {}, size: {}", meta.location, meta.size); +//! } +//! }) +//! .await; +//! # } +//! ``` +//! +//! Which will print out something like the following: +//! +//! ```text +//! Name: data/file01.parquet, size: 112832 +//! Name: data/file02.parquet, size: 143119 +//! Name: data/child/file03.parquet, size: 100 +//! ... +//! ``` +//! +//! # Fetching objects +//! +//! Use the [`ObjectStore::get`] method to fetch the data bytes +//! from remote storage or files in the local filesystem as a stream. +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # // use LocalFileSystem for example +//! # fn get_object_store() -> LocalFileSystem { +//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # } +//! +//! # async fn example() { +//! use std::sync::Arc; +//! use object_store::{path::Path, ObjectStore}; +//! use futures::stream::StreamExt; +//! +//! // create an ObjectStore +//! let object_store: Arc = Arc::new(get_object_store()); +//! +//! // Retrieve a specific file +//! let path: Path = "data/file01.parquet".try_into().unwrap(); +//! +//! // fetch the bytes from object store +//! let stream = object_store +//! .get(&path) +//! .await +//! .unwrap() +//! .into_stream(); +//! +//! // Count the '0's using `map` from `StreamExt` trait +//! let num_zeros = stream +//! .map(|bytes| { +//! let bytes = bytes.unwrap(); +//! bytes.iter().filter(|b| **b == 0).count() +//! }) +//! .collect::>() +//! .await +//! .into_iter() +//! .sum::(); +//! +//! println!("Num zeros in {} is {}", path, num_zeros); +//! # } +//! ``` +//! +//! Which will print out something like the following: +//! +//! ```text +//! Num zeros in data/file01.parquet is 657 +//! ``` //! 
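The doc examples above cover listing and fetching; for completeness, a minimal sketch (not part of the patch, using only APIs shown elsewhere in this series) of writing data through the same `ObjectStore` trait could look like the following, with the file name chosen purely for illustration:

```rust
// Sketch only: uploading data via the ObjectStore trait.
use std::sync::Arc;
use bytes::Bytes;
use object_store::{path::Path, ObjectStore};
use tokio::io::AsyncWriteExt;

async fn put_example(object_store: Arc<dyn ObjectStore>) {
    // Single-shot put for small payloads
    let path: Path = "data/file04.parquet".try_into().unwrap();
    object_store
        .put(&path, Bytes::from(vec![0, 1, 2]))
        .await
        .unwrap();

    // Streaming (multipart) upload for larger payloads
    let (_id, mut writer) = object_store.put_multipart(&path).await.unwrap();
    writer.write_all(&[0u8; 1024]).await.unwrap();
    writer.shutdown().await.unwrap();
}
```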
#[cfg(feature = "aws")] From 333eecec6fe0141d24e8d986c4664e67cd582e46 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:09:05 +0100 Subject: [PATCH 011/397] Retry GCP requests on server error (#2243) * Retry GCP requests on server error * Also retry OAuth * Lower default backoff configuration * Add retry disclaimer * Add retry_timeout * Add logging * Fix features --- Cargo.toml | 5 +- src/client/backoff.rs | 156 ++++++++++++++++++++++++++++++++++++++ src/client/mod.rs | 23 ++++++ src/{ => client}/oauth.rs | 12 ++- src/client/retry.rs | 106 ++++++++++++++++++++++++++ src/{ => client}/token.rs | 0 src/gcp.rs | 44 +++++++---- src/lib.rs | 6 +- 8 files changed, 328 insertions(+), 24 deletions(-) create mode 100644 src/client/backoff.rs create mode 100644 src/client/mod.rs rename src/{ => client}/oauth.rs (96%) create mode 100644 src/client/retry.rs rename src/{ => client}/token.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index b5b1ae1..aaf9ee9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ quick-xml = { version = "0.23.0", features = ["serialize"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } ring = { version = "0.16", default-features = false, features = ["std"] } base64 = { version = "0.13", default-features = false, optional = true } +rand = { version = "0.8", default-features = false, optional = true, features = ["std", "std_rng"] } # for rusoto hyper = { version = "0.14", optional = true, default-features = false } # for rusoto @@ -58,7 +59,7 @@ percent-encoding = "2.1" rusoto_core = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } rusoto_credential = { version = "0.48.0", optional = true, default-features = false } rusoto_s3 = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } -rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } +rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } snafu = "0.7" tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } tracing = { version = "0.1" } @@ -71,7 +72,7 @@ walkdir = "2" [features] azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"] azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] -gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64"] +gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand"] aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] [dev-dependencies] # In alphabetical order diff --git a/src/client/backoff.rs b/src/client/backoff.rs new file mode 100644 index 0000000..5a6126c --- /dev/null +++ b/src/client/backoff.rs @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use rand::prelude::*; +use std::time::Duration; + +/// Exponential backoff with jitter +/// +/// See +#[allow(missing_copy_implementations)] +#[derive(Debug, Clone)] +pub struct BackoffConfig { + /// The initial backoff duration + pub init_backoff: Duration, + /// The maximum backoff duration + pub max_backoff: Duration, + /// The base of the exponential to use + pub base: f64, +} + +impl Default for BackoffConfig { + fn default() -> Self { + Self { + init_backoff: Duration::from_millis(100), + max_backoff: Duration::from_secs(15), + base: 2., + } + } +} + +/// [`Backoff`] can be created from a [`BackoffConfig`] +/// +/// Consecutive calls to [`Backoff::next`] will return the next backoff interval +/// +pub struct Backoff { + init_backoff: f64, + next_backoff_secs: f64, + max_backoff_secs: f64, + base: f64, + rng: Option>, +} + +impl std::fmt::Debug for Backoff { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Backoff") + .field("init_backoff", &self.init_backoff) + .field("next_backoff_secs", &self.next_backoff_secs) + .field("max_backoff_secs", &self.max_backoff_secs) + .field("base", &self.base) + .finish() + } +} + +impl Backoff { + /// Create a new [`Backoff`] from the provided [`BackoffConfig`] + pub fn new(config: &BackoffConfig) -> Self { + Self::new_with_rng(config, None) + } + + /// Creates a new `Backoff` with the optional `rng` + /// + /// Used [`rand::thread_rng()`] if no rng provided + pub fn new_with_rng( + config: &BackoffConfig, + rng: Option>, + ) -> Self { + let init_backoff = config.init_backoff.as_secs_f64(); + Self { + init_backoff, + next_backoff_secs: init_backoff, + max_backoff_secs: config.max_backoff.as_secs_f64(), + base: config.base, + rng, + } + } + + /// Returns the next backoff duration to wait for + pub fn next(&mut self) -> Duration { + let range = self.init_backoff..(self.next_backoff_secs * self.base); + + let rand_backoff = match self.rng.as_mut() { + Some(rng) => rng.gen_range(range), + None => thread_rng().gen_range(range), + }; + + let next_backoff = self.max_backoff_secs.min(rand_backoff); + Duration::from_secs_f64(std::mem::replace( + &mut self.next_backoff_secs, + next_backoff, + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::mock::StepRng; + + #[test] + fn test_backoff() { + let init_backoff_secs = 1.; + let max_backoff_secs = 500.; + let base = 3.; + + let config = BackoffConfig { + init_backoff: Duration::from_secs_f64(init_backoff_secs), + max_backoff: Duration::from_secs_f64(max_backoff_secs), + base, + }; + + let assert_fuzzy_eq = + |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{} != {}", a, b); + + // Create a static rng that takes the minimum of the range + let rng = Box::new(StepRng::new(0, 0)); + let mut backoff = Backoff::new_with_rng(&config, Some(rng)); + + for _ in 0..20 { + assert_eq!(backoff.next().as_secs_f64(), init_backoff_secs); + } + + // Create a static rng that takes the maximum of the range + let rng = Box::new(StepRng::new(u64::MAX, 0)); + let mut backoff = Backoff::new_with_rng(&config, Some(rng)); + + for i in 0..20 { + let value = 
(base.powi(i) * init_backoff_secs).min(max_backoff_secs); + assert_fuzzy_eq(backoff.next().as_secs_f64(), value); + } + + // Create a static rng that takes the mid point of the range + let rng = Box::new(StepRng::new(u64::MAX / 2, 0)); + let mut backoff = Backoff::new_with_rng(&config, Some(rng)); + + let mut value = init_backoff_secs; + for _ in 0..20 { + assert_fuzzy_eq(backoff.next().as_secs_f64(), value); + value = (init_backoff_secs + (value * base - init_backoff_secs) / 2.) + .min(max_backoff_secs); + } + } +} diff --git a/src/client/mod.rs b/src/client/mod.rs new file mode 100644 index 0000000..1166ebe --- /dev/null +++ b/src/client/mod.rs @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Generic utilities reqwest based ObjectStore implementations + +pub mod backoff; +pub mod oauth; +pub mod retry; +pub mod token; diff --git a/src/oauth.rs b/src/client/oauth.rs similarity index 96% rename from src/oauth.rs rename to src/client/oauth.rs index 273e37b..88e7a7b 100644 --- a/src/oauth.rs +++ b/src/client/oauth.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::token::TemporaryToken; +use crate::client::retry::RetryExt; +use crate::client::token::TemporaryToken; +use crate::RetryConfig; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; @@ -133,7 +135,11 @@ impl OAuthProvider { } /// Fetch a fresh token - pub async fn fetch_token(&self, client: &Client) -> Result> { + pub async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result> { let now = seconds_since_epoch(); let exp = now + 3600; @@ -168,7 +174,7 @@ impl OAuthProvider { let response: TokenResponse = client .request(Method::POST, &self.audience) .form(&body) - .send() + .send_retry(retry) .await .context(TokenRequestSnafu)? .error_for_status() diff --git a/src/client/retry.rs b/src/client/retry.rs new file mode 100644 index 0000000..c4dd6ee --- /dev/null +++ b/src/client/retry.rs @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A shared HTTP client implementation incorporating retries + +use crate::client::backoff::{Backoff, BackoffConfig}; +use futures::future::BoxFuture; +use futures::FutureExt; +use reqwest::{Response, Result}; +use std::time::{Duration, Instant}; +use tracing::info; + +/// Contains the configuration for how to respond to server errors +/// +/// By default they will be retried up to some limit, using exponential +/// backoff with jitter. See [`BackoffConfig`] for more information +/// +#[derive(Debug, Clone)] +pub struct RetryConfig { + /// The backoff configuration + pub backoff: BackoffConfig, + + /// The maximum number of times to retry a request + /// + /// Set to 0 to disable retries + pub max_retries: usize, + + /// The maximum length of time from the initial request + /// after which no further retries will be attempted + /// + /// This not only bounds the length of time before a server + /// error will be surfaced to the application, but also bounds + /// the length of time a request's credentials must remain valid. + /// + /// As requests are retried without renewing credentials or + /// regenerating request payloads, this number should be kept + /// below 5 minutes to avoid errors due to expired credentials + /// and/or request payloads + pub retry_timeout: Duration, +} + +impl Default for RetryConfig { + fn default() -> Self { + Self { + backoff: Default::default(), + max_retries: 10, + retry_timeout: Duration::from_secs(3 * 60), + } + } +} + +pub trait RetryExt { + /// Dispatch a request with the given retry configuration + /// + /// # Panic + /// + /// This will panic if the request body is a stream + fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result>; +} + +impl RetryExt for reqwest::RequestBuilder { + fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result> { + let mut backoff = Backoff::new(&config.backoff); + let max_retries = config.max_retries; + let retry_timeout = config.retry_timeout; + + async move { + let mut retries = 0; + let now = Instant::now(); + + loop { + let s = self.try_clone().expect("request body must be cloneable"); + match s.send().await { + Err(e) + if retries < max_retries + && now.elapsed() < retry_timeout + && e.status() + .map(|s| s.is_server_error()) + .unwrap_or(false) => + { + let sleep = backoff.next(); + retries += 1; + info!("Encountered server error, backing off for {} seconds, retry {} of {}", sleep.as_secs_f32(), retries, max_retries); + tokio::time::sleep(sleep).await; + } + r => return r, + } + } + } + .boxed() + } +} diff --git a/src/token.rs b/src/client/token.rs similarity index 100% rename from src/token.rs rename to src/client/token.rs diff --git a/src/gcp.rs b/src/gcp.rs index dd9c844..f9cb2b2 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -46,14 +46,13 @@ use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, Snafu}; use tokio::io::AsyncWrite; -use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; -use crate::util::format_http_range; +use crate::client::retry::RetryExt; use crate::{ - oauth::OAuthProvider, + client::{oauth::OAuthProvider, token::TokenCache}, + multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, - token::TokenCache, - util::format_prefix, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + util::{format_http_range, format_prefix}, + GetResult, 
ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, }; #[derive(Debug, Snafu)] @@ -215,6 +214,8 @@ struct GoogleCloudStorageClient { bucket_name: String, bucket_name_encoded: String, + retry_config: RetryConfig, + // TODO: Hook this up in tests max_list_results: Option, } @@ -224,7 +225,9 @@ impl GoogleCloudStorageClient { if let Some(oauth_provider) = &self.oauth_provider { Ok(self .token_cache - .get_or_insert_with(|| oauth_provider.fetch_token(&self.client)) + .get_or_insert_with(|| { + oauth_provider.fetch_token(&self.client, &self.retry_config) + }) .await?) } else { Ok("".to_owned()) @@ -264,7 +267,7 @@ impl GoogleCloudStorageClient { let response = builder .bearer_auth(token) .query(&[("alt", alt)]) - .send() + .send_retry(&self.retry_config) .await .context(GetRequestSnafu { path: path.as_ref(), @@ -292,7 +295,7 @@ impl GoogleCloudStorageClient { .header(header::CONTENT_LENGTH, payload.len()) .query(&[("uploadType", "media"), ("name", path.as_ref())]) .body(payload) - .send() + .send_retry(&self.retry_config) .await .context(PutRequestSnafu)? .error_for_status() @@ -313,7 +316,7 @@ impl GoogleCloudStorageClient { .header(header::CONTENT_TYPE, "application/octet-stream") .header(header::CONTENT_LENGTH, "0") .query(&[("uploads", "")]) - .send() + .send_retry(&self.retry_config) .await .context(PutRequestSnafu)? .error_for_status() @@ -347,7 +350,7 @@ impl GoogleCloudStorageClient { .header(header::CONTENT_TYPE, "application/octet-stream") .header(header::CONTENT_LENGTH, "0") .query(&[("uploadId", multipart_id)]) - .send() + .send_retry(&self.retry_config) .await .context(PutRequestSnafu)? .error_for_status() @@ -364,7 +367,7 @@ impl GoogleCloudStorageClient { let builder = self.client.request(Method::DELETE, url); builder .bearer_auth(token) - .send() + .send_retry(&self.retry_config) .await .context(DeleteRequestSnafu { path: path.as_ref(), @@ -407,7 +410,7 @@ impl GoogleCloudStorageClient { builder .bearer_auth(token) - .send() + .send_retry(&self.retry_config) .await .context(CopyRequestSnafu { path: from.as_ref(), @@ -456,7 +459,7 @@ impl GoogleCloudStorageClient { .request(Method::GET, url) .query(&query) .bearer_auth(token) - .send() + .send_retry(&self.retry_config) .await .context(ListRequestSnafu)? .error_for_status() @@ -572,7 +575,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .header(header::CONTENT_TYPE, "application/octet-stream") .header(header::CONTENT_LENGTH, format!("{}", buf.len())) .body(buf) - .send() + .send_retry(&client.retry_config) .await .map_err(reqwest_error_as_io)? .error_for_status() @@ -643,7 +646,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .bearer_auth(token) .query(&[("uploadId", upload_id)]) .body(data) - .send() + .send_retry(&client.retry_config) .await .map_err(reqwest_error_as_io)? 
.error_for_status() @@ -803,6 +806,7 @@ pub struct GoogleCloudStorageBuilder { bucket_name: Option, service_account_path: Option, client: Option, + retry_config: RetryConfig, } impl GoogleCloudStorageBuilder { @@ -838,6 +842,12 @@ impl GoogleCloudStorageBuilder { self } + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + /// Use the specified http [`Client`] (defaults to [`Client::new`]) /// /// This allows you to set custom client options such as allowing @@ -859,6 +869,7 @@ impl GoogleCloudStorageBuilder { bucket_name, service_account_path, client, + retry_config, } = self; let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; @@ -897,6 +908,7 @@ impl GoogleCloudStorageBuilder { token_cache: Default::default(), bucket_name, bucket_name_encoded: encoded_bucket_name, + retry_config, max_list_results: None, }), }) diff --git a/src/lib.rs b/src/lib.rs index c1d7e3e..08634e2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -166,10 +166,10 @@ pub mod path; pub mod throttle; #[cfg(feature = "gcp")] -mod oauth; +mod client; #[cfg(feature = "gcp")] -mod token; +pub use client::{backoff::BackoffConfig, retry::RetryConfig}; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] mod multipart; @@ -451,7 +451,7 @@ pub enum Error { #[cfg(feature = "gcp")] #[snafu(display("OAuth error: {}", source), context(false))] - OAuth { source: oauth::Error }, + OAuth { source: client::oauth::Error }, } #[cfg(test)] From f1d744e40e41d7043d38054c9c2252e96e16ed17 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 5 Aug 2022 15:53:30 -0400 Subject: [PATCH 012/397] Remove vestigal ` object_store/.circleci/` (#2337) --- .circleci/config.yml | 262 ------------------------------------------- 1 file changed, 262 deletions(-) delete mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index b4dff6d..0000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,262 +0,0 @@ ---- -# CI Overview -# ----------- -# -# Each night: -# -# A build image is created (ci_image) from `docker/Dockerfile.ci` and is -# pushed to `quay.io/influxdb/rust:ci`. This build image is then used to run -# the CI tasks for the day. -# -# Every commit: -# -# The CI for every PR and merge to main runs tests, fmt, lints and compiles debug binaries -# -# On main if all these checks pass it will then additionally compile in "release" mode and -# publish a docker image to quay.io/influxdb/iox:$COMMIT_SHA -# -# Manual CI Image: -# -# It is possible to manually trigger a rebuild of the image used in CI. To do this, navigate to -# https://app.circleci.com/pipelines/github/influxdata/influxdb_iox?branch=main (overriding the -# branch name if desired). 
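A minimal sketch (not part of the patch) of wiring the new `with_retry` option together with the exported `RetryConfig` and `BackoffConfig`; the bucket name and service account path below are placeholders:

```rust
// Sketch only: configure retry behaviour for the GCS client.
// Values shown mirror the documented defaults; they are illustrative, not recommendations.
use std::time::Duration;
use object_store::gcp::{GoogleCloudStorage, GoogleCloudStorageBuilder};
use object_store::{BackoffConfig, RetryConfig};

fn build_gcs() -> GoogleCloudStorage {
    let retry = RetryConfig {
        backoff: BackoffConfig {
            init_backoff: Duration::from_millis(100),
            max_backoff: Duration::from_secs(15),
            base: 2.,
        },
        // Give up after 10 attempts or 3 minutes, whichever comes first
        max_retries: 10,
        retry_timeout: Duration::from_secs(3 * 60),
    };

    GoogleCloudStorageBuilder::new()
        .with_service_account_path("/tmp/gcs.json") // placeholder path
        .with_bucket_name("test-bucket")            // placeholder bucket
        .with_retry(retry)
        .build()
        .unwrap()
}
```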
Then: -# - Click "Run Pipeline" in the top-right -# - Expand "Add Parameters" -# - Add a "boolean" parameter called "ci_image" with the value true -# - Click "Run Pipeline" -# -# If you refresh the page you should see a newly running ci_image workflow -# - -version: 2.1 - -orbs: - win: circleci/windows@4.1 - -commands: - rust_components: - description: Verify installed components - steps: - - run: - name: Verify installed components - command: | - rustup --version - rustup show - cargo fmt --version - cargo clippy --version - - cache_restore: - description: Restore Cargo Cache - steps: - - restore_cache: - name: Restoring Cargo Cache - keys: - - cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} - - cargo-cache-{{ arch }}-{{ .Branch }} - - cargo-cache - cache_save: - description: Save Cargo Cache - steps: - - save_cache: - name: Save Cargo Cache - paths: - - /usr/local/cargo/registry - key: cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} - -jobs: - fmt: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Rust fmt - command: cargo fmt --all -- --check - - cache_save - lint: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Clippy - command: cargo clippy --all-targets --all-features --workspace -- -D warnings - - cache_save - cargo_audit: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Install cargo-deny - command: cargo install --force cargo-deny - - run: - name: cargo-deny Checks - command: cargo deny check -s - - cache_save - check: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Install cargo-hack - command: cargo install cargo-hack - - run: - name: Check all features - command: cargo hack check --feature-powerset --no-dev-deps --workspace - - cache_save - doc: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Cargo doc - # excluding datafusion because it's effectively a dependency masqueraded as workspace crate. - command: cargo doc --document-private-items --no-deps --workspace --exclude datafusion - - cache_save - - run: - name: Compress Docs - command: tar -cvzf rustdoc.tar.gz target/doc/ - - store_artifacts: - path: rustdoc.tar.gz - test: - # setup multiple docker images (see https://circleci.com/docs/2.0/configuration-reference/#docker) - docker: - - image: quay.io/influxdb/rust:ci - - image: localstack/localstack:0.14.4 - - image: mcr.microsoft.com/azure-storage/azurite - - image: fsouza/fake-gcs-server - command: - - "-scheme" - - "http" - resource_class: 2xlarge # use of a smaller executor tends crashes on link - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - RUST_BACKTRACE: "1" - # Run integration tests - TEST_INTEGRATION: 1 - AWS_DEFAULT_REGION: "us-east-1" - AWS_ACCESS_KEY_ID: test - AWS_SECRET_ACCESS_KEY: test - AWS_ENDPOINT: http://127.0.0.1:4566 - AZURE_USE_EMULATOR: "1" - GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" - OBJECT_STORE_BUCKET: test-bucket - steps: - - run: - name: Setup localstack (AWS emulation) - command: | - cd /tmp - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install - aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket - - run: - name: Setup Azurite (Azure emulation) - # the magical connection string is from https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings - command: | - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' - - run: - name: Setup fake GCS server - command: | - curl -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" - echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" - - checkout - - rust_components - - cache_restore - - run: - name: Cargo test - command: cargo test --workspace --features=aws,azure,azure_test,gcp - - cache_save - - test_windows: - executor: - name: win/default - size: medium - environment: - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - run: - name: Download rustup - command: wget https://win.rustup.rs/x86_64 -O rustup-init.exe - - run: - name: Install rustup - command: .\rustup-init.exe -y --default-host=x86_64-pc-windows-msvc - - run: - name: Cargo test - command: cargo test --workspace - -workflows: - version: 2 - - # CI for all pull requests. - ci: - jobs: - - check - - fmt - - lint - - cargo_audit - - test - - test_windows - - doc From a349447f3ce3be8468a83bc3a1d6238651c25fe0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 7 Aug 2022 18:53:46 +0100 Subject: [PATCH 013/397] Fix Copy from percent-encoded path (#2353) (#2354) --- src/aws.rs | 20 +++++++++++++++++++- src/azure.rs | 17 +++++++++-------- src/lib.rs | 7 +++++++ 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/aws.rs b/src/aws.rs index d59f48b..86766b0 100644 --- a/src/aws.rs +++ b/src/aws.rs @@ -48,6 +48,7 @@ use futures::{ Future, Stream, StreamExt, TryStreamExt, }; use hyper::client::Builder as HyperBuilder; +use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC}; use rusoto_core::ByteStream; use rusoto_credential::{InstanceMetadataProvider, StaticProvider}; use rusoto_s3::S3; @@ -62,6 +63,17 @@ use tokio::io::AsyncWrite; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tracing::{debug, warn}; +// Do not URI-encode any of the unreserved characters that RFC 3986 defines: +// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). 
+const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); + +/// This struct is used to maintain the URI path encoding +const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); + /// The maximum number of times a request will be retried in the case of an AWS server error pub const MAX_NUM_RETRIES: u32 = 3; @@ -541,9 +553,15 @@ impl ObjectStore for AmazonS3 { let to = to.as_ref(); let bucket_name = self.bucket_name.clone(); + let copy_source = format!( + "{}/{}", + &bucket_name, + percent_encode(from.as_ref(), &STRICT_PATH_ENCODE_SET) + ); + let request_factory = move || rusoto_s3::CopyObjectRequest { bucket: bucket_name.clone(), - copy_source: format!("{}/{}", &bucket_name, from), + copy_source, key: to.to_string(), ..Default::default() }; diff --git a/src/azure.rs b/src/azure.rs index 0d5f2fa..cee874b 100644 --- a/src/azure.rs +++ b/src/azure.rs @@ -470,14 +470,15 @@ impl ObjectStore for MicrosoftAzure { impl MicrosoftAzure { /// helper function to create a source url for copy function - fn get_copy_from_url(&self, from: &Path) -> Result { - Ok(reqwest::Url::parse(&format!( - "{}/{}/{}", - &self.blob_base_url, self.container_name, from - )) - .context(UnableToParseUrlSnafu { - container: &self.container_name, - })?) + fn get_copy_from_url(&self, from: &Path) -> Result { + let mut url = + Url::parse(&format!("{}/{}", &self.blob_base_url, self.container_name)) + .context(UnableToParseUrlSnafu { + container: &self.container_name, + })?; + + url.path_segments_mut().unwrap().extend(from.parts()); + Ok(url) } async fn list_impl( diff --git a/src/lib.rs b/src/lib.rs index 08634e2..b60a295 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -635,7 +635,14 @@ mod tests { assert_eq!(files, vec![emoji_file.clone()]); + let dst = Path::from("foo.parquet"); + storage.copy(&emoji_file, &dst).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst.clone()]); + storage.delete(&emoji_file).await.unwrap(); + storage.delete(&dst).await.unwrap(); let files = flatten_list_stream(storage, Some(&emoji_prefix)) .await .unwrap(); From 1216fe24817cc2b1ca2ebba2af7655fc56f91115 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 8 Aug 2022 10:05:29 +0100 Subject: [PATCH 014/397] Make ring optional dependency and cleanup tests (#2344) --- Cargo.toml | 8 +-- src/aws.rs | 57 ++------------------- src/azure.rs | 10 ++-- src/gcp.rs | 10 ++-- src/lib.rs | 129 ++++++++++++++++++++++-------------------------- src/limit.rs | 10 ++-- src/local.rs | 20 ++++---- src/memory.rs | 12 ++--- src/throttle.rs | 10 ++-- 9 files changed, 104 insertions(+), 162 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index aaf9ee9..7ccec86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,9 +46,9 @@ serde = { version = "1.0", default-features = false, features = ["derive"], opti serde_json = { version = "1.0", default-features = false, optional = true } quick-xml = { version = "0.23.0", features = ["serialize"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } -ring = { version = "0.16", default-features = false, features = ["std"] } +ring = { version = "0.16", default-features = false, features = ["std"], optional = true } base64 = { version = "0.13", default-features = false, optional = true } -rand = { version = "0.8", default-features = false, optional = true, features = 
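An illustrative sketch (not part of the patch) of what the `STRICT_PATH_ENCODE_SET` defined above does to an already percent-encoded key (the case this fix targets), with a placeholder bucket name:

```rust
// Sketch only: '%' itself is escaped while '/' is preserved, so S3's decoding
// of the CopyObject source recovers the original, percent-encoded key.
use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC};

const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC
    .remove(b'-')
    .remove(b'.')
    .remove(b'_')
    .remove(b'~');
const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/');

fn main() {
    let key = "a%2Fb.parquet"; // object key containing a percent sequence
    let copy_source = format!(
        "my-bucket/{}",
        percent_encode(key.as_bytes(), &STRICT_PATH_ENCODE_SET)
    );
    assert_eq!(copy_source, "my-bucket/a%252Fb.parquet");
}
```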
["std", "std_rng"] } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } # for rusoto hyper = { version = "0.14", optional = true, default-features = false } # for rusoto @@ -63,7 +63,7 @@ rusoto_sts = { version = "0.48.0", optional = true, default-features = false, fe snafu = "0.7" tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } tracing = { version = "0.1" } -reqwest = { version = "0.11", optional = true, default-features = false, features = ["rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } parking_lot = { version = "0.12" } # Filesystem integration url = "2.2" @@ -72,7 +72,7 @@ walkdir = "2" [features] azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"] azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] -gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand"] +gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] [dev-dependencies] # In alphabetical order diff --git a/src/aws.rs b/src/aws.rs index 86766b0..bcb294c 100644 --- a/src/aws.rs +++ b/src/aws.rs @@ -1027,34 +1027,6 @@ where } } -impl Error { - #[cfg(test)] - fn s3_error_due_to_credentials(&self) -> bool { - use rusoto_core::RusotoError; - use Error::*; - - matches!( - self, - UnableToPutData { - source: RusotoError::Credentials(_), - bucket: _, - path: _, - } | UnableToGetData { - source: RusotoError::Credentials(_), - bucket: _, - path: _, - } | UnableToDeleteData { - source: RusotoError::Credentials(_), - bucket: _, - path: _, - } | UnableToListData { - source: RusotoError::Credentials(_), - bucket: _, - } - ) - } -} - struct S3MultiPartUpload { bucket: String, key: String, @@ -1186,9 +1158,6 @@ mod tests { use bytes::Bytes; use std::env; - type TestError = Box; - type Result = std::result::Result; - const NON_EXISTENT_NAME: &str = "nonexistentname"; // Helper macro to skip tests if TEST_INTEGRATION and the AWS @@ -1268,32 +1237,16 @@ mod tests { }}; } - fn check_credentials(r: Result) -> Result { - if let Err(e) = &r { - let e = &**e; - if let Some(e) = e.downcast_ref::() { - if e.s3_error_due_to_credentials() { - eprintln!( - "Try setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY \ - environment variables" - ); - } - } - } - - r - } - #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); let integration = config.build().unwrap(); - check_credentials(put_get_delete_list(&integration).await).unwrap(); - check_credentials(list_uses_directories_correctly(&integration).await).unwrap(); - check_credentials(list_with_delimiter(&integration).await).unwrap(); - check_credentials(rename_and_copy(&integration).await).unwrap(); - check_credentials(stream_get(&integration).await).unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + stream_get(&integration).await; } #[tokio::test] diff --git a/src/azure.rs b/src/azure.rs index cee874b..6a5f537 100644 --- a/src/azure.rs +++ b/src/azure.rs @@ -858,10 +858,10 @@ mod tests { 
async fn azure_blob_test() { let integration = maybe_skip_integration!().build().unwrap(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - copy_if_not_exists(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; } } diff --git a/src/gcp.rs b/src/gcp.rs index f9cb2b2..0dc5a95 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -1003,14 +1003,14 @@ mod test { async fn gcs_test() { let integration = maybe_skip_integration!().build().unwrap(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; if integration.client.base_url == default_gcs_base_url() { // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 - stream_get(&integration).await.unwrap(); + stream_get(&integration).await; } } diff --git a/src/lib.rs b/src/lib.rs index b60a295..564799d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -478,15 +478,12 @@ mod tests { use crate::test_util::flatten_list_stream; use tokio::io::AsyncWriteExt; - type Error = Box; - type Result = std::result::Result; - - pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { let store_str = storage.to_string(); delete_fixtures(storage).await; - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!( content_list.is_empty(), "Expected list to be empty; found: {:?}", @@ -497,16 +494,16 @@ mod tests { let data = Bytes::from("arbitrary data"); let expected_data = data.clone(); - storage.put(&location, data).await?; + storage.put(&location, data).await.unwrap(); let root = Path::from("/"); // List everything - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert_eq!(content_list, &[location.clone()]); // Should behave the same as no prefix - let content_list = flatten_list_stream(storage, Some(&root)).await?; + let content_list = flatten_list_stream(storage, Some(&root)).await.unwrap(); assert_eq!(content_list, &[location.clone()]); // List with delimiter @@ -523,15 +520,15 @@ mod tests { // List everything starting with a prefix that should return results let prefix = Path::from("test_dir"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[location.clone()]); // List everything starting with a prefix that shouldn't return results let prefix = Path::from("something"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert!(content_list.is_empty()); - let read_data = storage.get(&location).await?.bytes().await?; + 
let read_data = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(&*read_data, expected_data); // Test range request @@ -557,12 +554,12 @@ mod tests { out_of_range_result.unwrap_err(); } - let head = storage.head(&location).await?; + let head = storage.head(&location).await.unwrap(); assert_eq!(head.size, expected_data.len()); - storage.delete(&location).await?; + storage.delete(&location).await.unwrap(); - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!(content_list.is_empty()); let err = storage.get(&location).await.unwrap_err(); @@ -647,8 +644,6 @@ mod tests { .await .unwrap(); assert!(files.is_empty()); - - Ok(()) } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { @@ -657,15 +652,15 @@ mod tests { .collect() } - pub(crate) async fn stream_get(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn stream_get(storage: &DynObjectStore) { let location = Path::from("test_dir/test_upload_file.txt"); // Can write to storage let data = get_vec_of_bytes(5_000_000, 10); let bytes_expected = data.concat(); - let (_, mut writer) = storage.put_multipart(&location).await?; + let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { - writer.write_all(chunk).await?; + writer.write_all(chunk).await.unwrap(); } // Object should not yet exist in store @@ -676,26 +671,29 @@ mod tests { crate::Error::NotFound { .. } )); - writer.shutdown().await?; - let bytes_written = storage.get(&location).await?.bytes().await?; + writer.shutdown().await.unwrap(); + let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); // Can overwrite some storage let data = get_vec_of_bytes(5_000, 5); let bytes_expected = data.concat(); - let (_, mut writer) = storage.put_multipart(&location).await?; + let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { - writer.write_all(chunk).await?; + writer.write_all(chunk).await.unwrap(); } - writer.shutdown().await?; - let bytes_written = storage.get(&location).await?.bytes().await?; + writer.shutdown().await.unwrap(); + let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); // We can abort an empty write let location = Path::from("test_dir/test_abort_upload.txt"); - let (upload_id, writer) = storage.put_multipart(&location).await?; + let (upload_id, writer) = storage.put_multipart(&location).await.unwrap(); drop(writer); - storage.abort_multipart(&location, &upload_id).await?; + storage + .abort_multipart(&location, &upload_id) + .await + .unwrap(); let get_res = storage.get(&location).await; assert!(get_res.is_err()); assert!(matches!( @@ -704,30 +702,29 @@ mod tests { )); // We can abort an in-progress write - let (upload_id, mut writer) = storage.put_multipart(&location).await?; + let (upload_id, mut writer) = storage.put_multipart(&location).await.unwrap(); if let Some(chunk) = data.get(0) { - writer.write_all(chunk).await?; - let _ = writer.write(chunk).await?; + writer.write_all(chunk).await.unwrap(); + let _ = writer.write(chunk).await.unwrap(); } drop(writer); - storage.abort_multipart(&location, &upload_id).await?; + storage + .abort_multipart(&location, &upload_id) + .await + .unwrap(); let get_res = storage.get(&location).await; assert!(get_res.is_err()); assert!(matches!( get_res.unwrap_err(), crate::Error::NotFound { .. 
} )); - - Ok(()) } - pub(crate) async fn list_uses_directories_correctly( - storage: &DynObjectStore, - ) -> Result<()> { + pub(crate) async fn list_uses_directories_correctly(storage: &DynObjectStore) { delete_fixtures(storage).await; - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!( content_list.is_empty(), "Expected list to be empty; found: {:?}", @@ -738,25 +735,23 @@ mod tests { let location2 = Path::from("foo.bar/y.json"); let data = Bytes::from("arbitrary data"); - storage.put(&location1, data.clone()).await?; - storage.put(&location2, data).await?; + storage.put(&location1, data.clone()).await.unwrap(); + storage.put(&location2, data).await.unwrap(); let prefix = Path::from("foo"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[location1.clone()]); let prefix = Path::from("foo/x"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[]); - - Ok(()) } - pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) { delete_fixtures(storage).await; // ==================== check: store is empty ==================== - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!(content_list.is_empty()); // ==================== do: create files ==================== @@ -818,10 +813,8 @@ mod tests { } // ==================== check: store is empty ==================== - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!(content_list.is_empty()); - - Ok(()) } pub(crate) async fn get_nonexistent_object( @@ -837,7 +830,7 @@ mod tests { storage.get(&location).await?.bytes().await } - pub(crate) async fn rename_and_copy(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn rename_and_copy(storage: &DynObjectStore) { // Create two objects let path1 = Path::from("test1"); let path2 = Path::from("test2"); @@ -845,29 +838,27 @@ mod tests { let contents2 = Bytes::from("dogs"); // copy() make both objects identical - storage.put(&path1, contents1.clone()).await?; - storage.put(&path2, contents2.clone()).await?; - storage.copy(&path1, &path2).await?; - let new_contents = storage.get(&path2).await?.bytes().await?; + storage.put(&path1, contents1.clone()).await.unwrap(); + storage.put(&path2, contents2.clone()).await.unwrap(); + storage.copy(&path1, &path2).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); // rename() copies contents and deletes original - storage.put(&path1, contents1.clone()).await?; - storage.put(&path2, contents2.clone()).await?; - storage.rename(&path1, &path2).await?; - let new_contents = storage.get(&path2).await?.bytes().await?; + storage.put(&path1, contents1.clone()).await.unwrap(); + storage.put(&path2, contents2.clone()).await.unwrap(); + storage.rename(&path1, &path2).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); let result = storage.get(&path1).await; assert!(result.is_err()); 
assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); // Clean up - storage.delete(&path2).await?; - - Ok(()) + storage.delete(&path2).await.unwrap(); } - pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) { // Create two objects let path1 = Path::from("test1"); let path2 = Path::from("test2"); @@ -875,8 +866,8 @@ mod tests { let contents2 = Bytes::from("dogs"); // copy_if_not_exists() errors if destination already exists - storage.put(&path1, contents1.clone()).await?; - storage.put(&path2, contents2.clone()).await?; + storage.put(&path1, contents1.clone()).await.unwrap(); + storage.put(&path2, contents2.clone()).await.unwrap(); let result = storage.copy_if_not_exists(&path1, &path2).await; assert!(result.is_err()); assert!(matches!( @@ -885,19 +876,17 @@ mod tests { )); // copy_if_not_exists() copies contents and allows deleting original - storage.delete(&path2).await?; - storage.copy_if_not_exists(&path1, &path2).await?; - storage.delete(&path1).await?; - let new_contents = storage.get(&path2).await?.bytes().await?; + storage.delete(&path2).await.unwrap(); + storage.copy_if_not_exists(&path1, &path2).await.unwrap(); + storage.delete(&path1).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); let result = storage.get(&path1).await; assert!(result.is_err()); assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); // Clean up - storage.delete(&path2).await?; - - Ok(()) + storage.delete(&path2).await.unwrap(); } async fn delete_fixtures(storage: &DynObjectStore) { diff --git a/src/limit.rs b/src/limit.rs index fd21ccb..acee7d5 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -237,11 +237,11 @@ mod tests { let memory = InMemory::new(); let integration = LimitStore::new(memory, max_requests); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + stream_get(&integration).await; let mut streams = Vec::with_capacity(max_requests); for _ in 0..max_requests { diff --git a/src/local.rs b/src/local.rs index c3f54e0..0954981 100644 --- a/src/local.rs +++ b/src/local.rs @@ -888,12 +888,12 @@ mod tests { let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - copy_if_not_exists(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; } #[test] @@ -901,10 +901,10 @@ mod tests { let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); futures::executor::block_on(async move { - put_get_delete_list(&integration).await.unwrap(); - 
list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + stream_get(&integration).await; }); } diff --git a/src/memory.rs b/src/memory.rs index dc3967d..98eb3aa 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -305,12 +305,12 @@ mod tests { async fn in_memory_test() { let integration = InMemory::new(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - copy_if_not_exists(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; } #[tokio::test] diff --git a/src/throttle.rs b/src/throttle.rs index 6789f0e..dba9f24 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -308,11 +308,11 @@ mod tests { let inner = InMemory::new(); let store = ThrottledStore::new(inner, ThrottleConfig::default()); - put_get_delete_list(&store).await.unwrap(); - list_uses_directories_correctly(&store).await.unwrap(); - list_with_delimiter(&store).await.unwrap(); - rename_and_copy(&store).await.unwrap(); - copy_if_not_exists(&store).await.unwrap(); + put_get_delete_list(&store).await; + list_uses_directories_correctly(&store).await; + list_with_delimiter(&store).await; + rename_and_copy(&store).await; + copy_if_not_exists(&store).await; } #[tokio::test] From c047d50b48d359b62126124757da39f2aa0d251e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 8 Aug 2022 11:19:23 +0100 Subject: [PATCH 015/397] Relax path validation (#2355) (#2356) * Relax path validation (#2355) * Iterate over bytes --- src/lib.rs | 36 +++++++++++++++++++++++++++++++ src/local.rs | 20 ++++++++++++++++- src/path/mod.rs | 1 - src/path/parts.rs | 55 +++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 101 insertions(+), 11 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 564799d..71ea3e0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -644,6 +644,42 @@ mod tests { .await .unwrap(); assert!(files.is_empty()); + + // Test handling of paths containing percent-encoded sequences + + // "HELLO" percent encoded + let hello_prefix = Path::parse("%48%45%4C%4C%4F").unwrap(); + let path = hello_prefix.child("foo.parquet"); + + storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + let files = flatten_list_stream(storage, Some(&hello_prefix)) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + + // Cannot list by decoded representation + let files = flatten_list_stream(storage, Some(&Path::from("HELLO"))) + .await + .unwrap(); + assert!(files.is_empty()); + + // Cannot access by decoded representation + let err = storage + .head(&Path::from("HELLO/foo.parquet")) + .await + .unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + + storage.delete(&path).await.unwrap(); + + // Can also write non-percent encoded sequences + let path = Path::parse("%Q.parquet").unwrap(); + storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(files, vec![path.clone()]); + + storage.delete(&path).await.unwrap(); } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { diff --git a/src/local.rs b/src/local.rs index 0954981..6d7fdf9 100644 --- a/src/local.rs +++ b/src/local.rs @@ -1212,7 +1212,7 @@ mod tests { .to_string(); assert!( - err.contains("Invalid path segment - got \"💀\" expected: \"%F0%9F%92%80\""), + err.contains("Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\""), "{}", err ); @@ -1247,4 +1247,22 @@ mod tests { 0 ); } + + #[tokio::test] + async fn filesystem_filename_with_percent() { + let temp_dir = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(temp_dir.path()).unwrap(); + let filename = "L%3ABC.parquet"; + + std::fs::write(temp_dir.path().join(filename), "foo").unwrap(); + + let list_stream = integration.list(None).await.unwrap(); + let res: Vec<_> = list_stream.try_collect().await.unwrap(); + assert_eq!(res.len(), 1); + assert_eq!(res[0].location.as_ref(), filename); + + let res = integration.list_with_delimiter(None).await.unwrap(); + assert_eq!(res.objects.len(), 1); + assert_eq!(res.objects[0].location.as_ref(), filename); + } } diff --git a/src/path/mod.rs b/src/path/mod.rs index 38b7eb3..5f16d05 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -126,7 +126,6 @@ pub enum Error { /// Path::parse("..").unwrap_err(); /// Path::parse("/foo//").unwrap_err(); /// Path::parse("😀").unwrap_err(); -/// Path::parse("%Q").unwrap_err(); /// ``` /// /// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt diff --git a/src/path/parts.rs b/src/path/parts.rs index e73b184..9da4815 100644 --- a/src/path/parts.rs +++ b/src/path/parts.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use percent_encoding::{percent_decode, percent_encode, AsciiSet, CONTROLS}; +use percent_encoding::{percent_encode, AsciiSet, CONTROLS}; use std::borrow::Cow; use crate::path::DELIMITER_BYTE; @@ -23,11 +23,15 @@ use snafu::Snafu; /// Error returned by [`PathPart::parse`] #[derive(Debug, Snafu)] -#[snafu(display("Invalid path segment - got \"{}\" expected: \"{}\"", actual, expected))] +#[snafu(display( + "Encountered illegal character sequence \"{}\" whilst parsing path segment \"{}\"", + illegal, + segment +))] #[allow(missing_copy_implementations)] pub struct InvalidPart { - actual: String, - expected: String, + segment: String, + illegal: String, } /// The PathPart type exists to validate the directory/file names that form part @@ -43,21 +47,40 @@ pub struct PathPart<'a> { impl<'a> PathPart<'a> { /// Parse the provided path segment as a [`PathPart`] returning an error if invalid pub fn parse(segment: &'a str) -> Result { - let decoded: Cow<'a, [u8]> = percent_decode(segment.as_bytes()).into(); - let part = PathPart::from(decoded.as_ref()); - if segment != part.as_ref() { + if segment == "." || segment == ".." 
{ return Err(InvalidPart { - actual: segment.to_string(), - expected: part.raw.to_string(), + segment: segment.to_string(), + illegal: segment.to_string(), }); } + for (idx, b) in segment.as_bytes().iter().cloned().enumerate() { + // A percent character is always valid, even if not + // followed by a valid 2-digit hex code + // https://url.spec.whatwg.org/#percent-encoded-bytes + if b == b'%' { + continue; + } + + if !b.is_ascii() || should_percent_encode(b) { + return Err(InvalidPart { + segment: segment.to_string(), + // This is correct as only single byte characters up to this point + illegal: segment.chars().nth(idx).unwrap().to_string(), + }); + } + } + Ok(Self { raw: segment.into(), }) } } +fn should_percent_encode(c: u8) -> bool { + percent_encode(&[c], INVALID).next().unwrap().len() != 1 +} + /// Characters we want to encode. const INVALID: &AsciiSet = &CONTROLS // The delimiter we are reserving for internal hierarchy @@ -145,4 +168,18 @@ mod tests { let part: PathPart<'_> = "..".into(); assert_eq!(part.raw, "%2E%2E"); } + + #[test] + fn path_part_parse() { + PathPart::parse("foo").unwrap(); + PathPart::parse("foo/bar").unwrap_err(); + + // Test percent-encoded path + PathPart::parse("foo%2Fbar").unwrap(); + PathPart::parse("L%3ABC.parquet").unwrap(); + + // Test path containing bad escape sequence + PathPart::parse("%Z").unwrap(); + PathPart::parse("%%").unwrap(); + } } From 99769b265701fbd8fe12fe643bc6e0af172a089a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 8 Aug 2022 11:20:31 +0100 Subject: [PATCH 016/397] Add ObjectStore::get_ranges (#2293) (#2336) * Add ObjectStore::get_ranges (#2293) * Review feedback --- src/lib.rs | 25 ++++++++++++- src/limit.rs | 9 +++++ src/local.rs | 59 ++++++++++++++++++++---------- src/memory.rs | 16 +++++++++ src/throttle.rs | 30 +++++++++++++++- src/util.rs | 96 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 214 insertions(+), 21 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 71ea3e0..57e1371 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -176,7 +176,9 @@ mod multipart; mod util; use crate::path::Path; -use crate::util::{collect_bytes, maybe_spawn_blocking}; +use crate::util::{ + coalesce_ranges, collect_bytes, maybe_spawn_blocking, OBJECT_STORE_COALESCE_DEFAULT, +}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; @@ -231,6 +233,21 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// in the given byte range async fn get_range(&self, location: &Path, range: Range) -> Result; + /// Return the bytes that are stored at the specified location + /// in the given byte ranges + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + coalesce_ranges( + ranges, + |range| self.get_range(location, range), + OBJECT_STORE_COALESCE_DEFAULT, + ) + .await + } + /// Return the metadata for the specified location async fn head(&self, location: &Path) -> Result; @@ -552,6 +569,12 @@ mod tests { // Should be a non-fatal error out_of_range_result.unwrap_err(); + + let ranges = vec![0..1, 2..3, 0..5]; + let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); + for (range, bytes) in ranges.iter().zip(bytes) { + assert_eq!(bytes, expected_data.slice(range.clone())) + } } let head = storage.head(&location).await.unwrap(); diff --git a/src/limit.rs b/src/limit.rs index acee7d5..09c88aa 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -110,6 +110,15 @@ impl ObjectStore for LimitStore { 
self.inner.get_range(location, range).await } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.get_ranges(location, ranges).await + } + async fn head(&self, location: &Path) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.head(location).await diff --git a/src/local.rs b/src/local.rs index 6d7fdf9..c590340 100644 --- a/src/local.rs +++ b/src/local.rs @@ -322,26 +322,25 @@ impl ObjectStore for LocalFileSystem { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let mut file = open_file(&path)?; - let to_read = range.end - range.start; - file.seek(SeekFrom::Start(range.start as u64)) - .context(SeekSnafu { path: &path })?; - - let mut buf = Vec::with_capacity(to_read); - let read = file - .take(to_read as u64) - .read_to_end(&mut buf) - .context(UnableToReadBytesSnafu { path: &path })?; - - ensure!( - read == to_read, - OutOfRangeSnafu { - path: &path, - expected: to_read, - actual: read - } - ); + read_range(&mut file, &path, range) + }) + .await + } - Ok(buf.into()) + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + let path = self.config.path_to_filesystem(location)?; + let ranges = ranges.to_vec(); + maybe_spawn_blocking(move || { + // Vectored IO might be faster + let mut file = open_file(&path)?; + ranges + .into_iter() + .map(|r| read_range(&mut file, &path, r)) + .collect() }) .await } @@ -750,6 +749,28 @@ impl AsyncWrite for LocalUpload { } } +fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { + let to_read = range.end - range.start; + file.seek(SeekFrom::Start(range.start as u64)) + .context(SeekSnafu { path })?; + + let mut buf = Vec::with_capacity(to_read); + let read = file + .take(to_read as u64) + .read_to_end(&mut buf) + .context(UnableToReadBytesSnafu { path })?; + + ensure!( + read == to_read, + OutOfRangeSnafu { + path, + expected: to_read, + actual: read + } + ); + Ok(buf.into()) +} + fn open_file(path: &PathBuf) -> Result { let file = File::open(path).map_err(|e| { if e.kind() == std::io::ErrorKind::NotFound { diff --git a/src/memory.rs b/src/memory.rs index 98eb3aa..e4be5b2 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -128,6 +128,22 @@ impl ObjectStore for InMemory { Ok(data.slice(range)) } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + let data = self.get_bytes(location).await?; + ranges + .iter() + .map(|range| { + ensure!(range.end <= data.len(), OutOfRangeSnafu); + ensure!(range.start <= range.end, BadRangeSnafu); + Ok(data.slice(range.clone())) + }) + .collect() + } + async fn head(&self, location: &Path) -> Result { let last_modified = Utc::now(); let bytes = self.get_bytes(location).await?; diff --git a/src/throttle.rs b/src/throttle.rs index dba9f24..90f427c 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -197,7 +197,7 @@ impl ObjectStore for ThrottledStore { async fn get_range(&self, location: &Path, range: Range) -> Result { let config = self.config(); - let sleep_duration = config.wait_delete_per_call + let sleep_duration = config.wait_get_per_call + config.wait_get_per_byte * (range.end - range.start) as u32; sleep(sleep_duration).await; @@ -205,6 +205,22 @@ impl ObjectStore for ThrottledStore { self.inner.get_range(location, range).await } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + let config = self.config(); + + let total_bytes: 
usize = ranges.iter().map(|range| range.end - range.start).sum(); + let sleep_duration = + config.wait_get_per_call + config.wait_get_per_byte * total_bytes as u32; + + sleep(sleep_duration).await; + + self.inner.get_ranges(location, ranges).await + } + async fn head(&self, location: &Path) -> Result { sleep(self.config().wait_put_per_call).await; self.inner.head(location).await @@ -260,11 +276,23 @@ impl ObjectStore for ThrottledStore { self.inner.copy(from, to).await } + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.rename(from, to).await + } + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { sleep(self.config().wait_put_per_call).await; self.inner.copy_if_not_exists(from, to).await } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.rename_if_not_exists(from, to).await + } } /// Saturated `usize` to `u32` cast. diff --git a/src/util.rs b/src/util.rs index 4f3ed86..1ef4995 100644 --- a/src/util.rs +++ b/src/util.rs @@ -71,3 +71,99 @@ where Err(_) => f(), } } + +/// Range requests with a gap less than or equal to this, +/// will be coalesced into a single request by [`coalesce_ranges`] +pub const OBJECT_STORE_COALESCE_DEFAULT: usize = 1024 * 1024; + +/// Takes a function to fetch ranges and coalesces adjacent ranges if they are +/// less than `coalesce` bytes apart. Out of order `ranges` are not coalesced +pub async fn coalesce_ranges( + ranges: &[std::ops::Range], + mut fetch: F, + coalesce: usize, +) -> Result> +where + F: FnMut(std::ops::Range) -> Fut, + Fut: std::future::Future>, +{ + let mut ret = Vec::with_capacity(ranges.len()); + let mut start_idx = 0; + let mut end_idx = 1; + + while start_idx != ranges.len() { + while end_idx != ranges.len() + && ranges[end_idx] + .start + .checked_sub(ranges[start_idx].end) + .map(|delta| delta <= coalesce) + .unwrap_or(false) + { + end_idx += 1; + } + + let start = ranges[start_idx].start; + let end = ranges[end_idx - 1].end; + let bytes = fetch(start..end).await?; + for i in start_idx..end_idx { + let range = ranges[i].clone(); + ret.push(bytes.slice(range.start - start..range.end - start)) + } + start_idx = end_idx; + end_idx += 1; + } + Ok(ret) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ops::Range; + + #[tokio::test] + async fn test_coalesce_ranges() { + let do_fetch = |ranges: Vec>, coalesce: usize| async move { + let max = ranges.iter().map(|x| x.end).max().unwrap_or(0); + let src: Vec<_> = (0..max).map(|x| x as u8).collect(); + + let mut fetches = vec![]; + let coalesced = coalesce_ranges( + &ranges, + |range| { + fetches.push(range.clone()); + futures::future::ready(Ok(Bytes::from(src[range].to_vec()))) + }, + coalesce, + ) + .await + .unwrap(); + + assert_eq!(ranges.len(), coalesced.len()); + for (range, bytes) in ranges.iter().zip(coalesced) { + assert_eq!(bytes.as_ref(), &src[range.clone()]); + } + fetches + }; + + let fetches = do_fetch(vec![], 0).await; + assert_eq!(fetches, vec![]); + + let fetches = do_fetch(vec![0..3], 0).await; + assert_eq!(fetches, vec![0..3]); + + let fetches = do_fetch(vec![0..2, 3..5], 0).await; + assert_eq!(fetches, vec![0..2, 3..5]); + + let fetches = do_fetch(vec![0..1, 1..2], 0).await; + assert_eq!(fetches, vec![0..2]); + + let fetches = do_fetch(vec![0..1, 2..72], 1).await; + assert_eq!(fetches, vec![0..72]); + + let fetches = do_fetch(vec![0..1, 56..72, 73..75], 1).await; + 
assert_eq!(fetches, vec![0..1, 56..75]); + + let fetches = do_fetch(vec![0..1, 5..6, 7..9, 2..3, 4..6], 1).await; + assert_eq!(fetches, vec![0..1, 5..9, 2..6]); + } +} From b1fec672f19441091abfcc11b2ac7343e7e9d708 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 8 Aug 2022 17:57:09 +0100 Subject: [PATCH 017/397] Fix object_store lint (#2367) --- src/util.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/util.rs b/src/util.rs index 1ef4995..46e9e9e 100644 --- a/src/util.rs +++ b/src/util.rs @@ -84,8 +84,8 @@ pub async fn coalesce_ranges( coalesce: usize, ) -> Result> where - F: FnMut(std::ops::Range) -> Fut, - Fut: std::future::Future>, + F: Send + FnMut(std::ops::Range) -> Fut, + Fut: std::future::Future> + Send, { let mut ret = Vec::with_capacity(ranges.len()); let mut start_idx = 0; @@ -105,8 +105,7 @@ where let start = ranges[start_idx].start; let end = ranges[end_idx - 1].end; let bytes = fetch(start..end).await?; - for i in start_idx..end_idx { - let range = ranges[i].clone(); + for range in ranges.iter().take(end_idx).skip(start_idx) { ret.push(bytes.slice(range.start - start..range.end - start)) } start_idx = end_idx; From beb6e19a63eda9e6ff178b8ef3646954cc7025fe Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 9 Aug 2022 15:56:02 +0100 Subject: [PATCH 018/397] Canonicalize filesystem paths in user-facing APIs (#2370) (#2371) * Canonicalize LocalFileSystem::root (#2370) * Canonicalize paths passed to Path::from_filesystem_path * Add test --- src/local.rs | 33 +++++++++++++++++++++++++++++---- src/path/mod.rs | 31 +++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/src/local.rs b/src/local.rs index c590340..fd3c359 100644 --- a/src/local.rs +++ b/src/local.rs @@ -18,7 +18,7 @@ //! An object store implementation for a local filesystem use crate::{ maybe_spawn_blocking, - path::{filesystem_path_to_url, Path}, + path::{absolute_path_to_url, Path}, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; use async_trait::async_trait; @@ -129,6 +129,12 @@ pub(crate) enum Error { path: String, source: io::Error, }, + + #[snafu(display("Unable to canonicalize filesystem root: {}", path.display()))] + UnableToCanonicalize { + path: PathBuf, + source: io::Error, + }, } impl From for super::Error { @@ -214,17 +220,24 @@ impl LocalFileSystem { } /// Create new filesystem storage with `prefix` applied to all paths + /// + /// Returns an error if the path does not exist + /// pub fn new_with_prefix(prefix: impl AsRef) -> Result { + let path = std::fs::canonicalize(&prefix).context(UnableToCanonicalizeSnafu { + path: prefix.as_ref(), + })?; + Ok(Self { config: Arc::new(Config { - root: filesystem_path_to_url(prefix)?, + root: absolute_path_to_url(path)?, }), }) } } impl Config { - /// Return filesystem path of the given location + /// Return an absolute filesystem path of the given location fn path_to_filesystem(&self, location: &Path) -> Result { let mut url = self.root.clone(); url.path_segments_mut() @@ -238,8 +251,9 @@ impl Config { .map_err(|_| Error::InvalidUrl { url }.into()) } + /// Resolves the provided absolute filesystem path to a [`Path`] prefix fn filesystem_to_path(&self, location: &std::path::Path) -> Result { - Ok(Path::from_filesystem_path_with_base( + Ok(Path::from_absolute_path_with_base( location, Some(&self.root), )?) 
@@ -1286,4 +1300,15 @@ mod tests { assert_eq!(res.objects.len(), 1); assert_eq!(res.objects[0].location.as_ref(), filename); } + + #[tokio::test] + async fn relative_paths() { + LocalFileSystem::new_with_prefix(".").unwrap(); + LocalFileSystem::new_with_prefix("..").unwrap(); + LocalFileSystem::new_with_prefix("../..").unwrap(); + + let integration = LocalFileSystem::new(); + let path = Path::from_filesystem_path(".").unwrap(); + integration.list_with_delimiter(Some(&path)).await.unwrap(); + } } diff --git a/src/path/mod.rs b/src/path/mod.rs index 5f16d05..e5a7b64 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -162,23 +162,38 @@ impl Path { /// Convert a filesystem path to a [`Path`] relative to the filesystem root /// - /// This will return an error if the path contains illegal - /// character sequences as defined by [`Path::parse`] + /// This will return an error if the path contains illegal character sequences + /// as defined by [`Path::parse`] or does not exist + /// + /// Note: this will canonicalize the provided path, resolving any symlinks pub fn from_filesystem_path( path: impl AsRef, ) -> Result { - Self::from_filesystem_path_with_base(path, None) + let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu { + path: path.as_ref(), + })?; + + Self::from_absolute_path(absolute) + } + + /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root + /// + /// This will return an error if the path contains illegal character sequences + /// as defined by [`Path::parse`], or `base` is not an absolute path + pub fn from_absolute_path(path: impl AsRef) -> Result { + Self::from_absolute_path_with_base(path, None) } /// Convert a filesystem path to a [`Path`] relative to the provided base /// /// This will return an error if the path contains illegal character sequences - /// as defined by [`Path::parse`], or `base` does not refer to a parent path of `path` - pub(crate) fn from_filesystem_path_with_base( + /// as defined by [`Path::parse`], or `base` does not refer to a parent path of `path`, + /// or `base` is not an absolute path + pub(crate) fn from_absolute_path_with_base( path: impl AsRef, base: Option<&Url>, ) -> Result { - let url = filesystem_path_to_url(path)?; + let url = absolute_path_to_url(path)?; let path = match base { Some(prefix) => url.path().strip_prefix(prefix.path()).ok_or_else(|| { Error::PrefixMismatch { @@ -293,8 +308,8 @@ where } } -/// Given a filesystem path convert it to a URL representation -pub(crate) fn filesystem_path_to_url( +/// Given an absolute filesystem path convert it to a URL representation without canonicalization +pub(crate) fn absolute_path_to_url( path: impl AsRef, ) -> Result { Url::from_file_path(&path).map_err(|_| Error::InvalidPath { From 1665f6a2a0bc50e0350d0d5330f6925badb30c0a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 10 Aug 2022 10:17:30 -0400 Subject: [PATCH 019/397] object_store: Update version to `0.4.0`, initial release scripts, CHANGELOG for `0.4.0` release (#2392) * begin putting object store scripts in place * More scripts, initial changelog * Fix tags * updates * Create tarball script * Update version * Start on verify script * Updates * Add release scripts * README updates * prettier * Clean up changelog --- .github_changelog_generator | 27 +++++ CHANGELOG.md | 74 ++++++++++++++ Cargo.toml | 2 +- dev/release/README.md | 20 ++++ dev/release/create-tarball.sh | 128 ++++++++++++++++++++++++ dev/release/release-tarball.sh | 76 ++++++++++++++ dev/release/update_change_log.sh | 76 
++++++++++++++ dev/release/verify-release-candidate.sh | 128 ++++++++++++++++++++++++ 8 files changed, 530 insertions(+), 1 deletion(-) create mode 100644 .github_changelog_generator create mode 100644 CHANGELOG.md create mode 100644 dev/release/README.md create mode 100755 dev/release/create-tarball.sh create mode 100755 dev/release/release-tarball.sh create mode 100755 dev/release/update_change_log.sh create mode 100755 dev/release/verify-release-candidate.sh diff --git a/.github_changelog_generator b/.github_changelog_generator new file mode 100644 index 0000000..cbd8aa0 --- /dev/null +++ b/.github_changelog_generator @@ -0,0 +1,27 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Add special sections for documentation, security and performance +add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"security":{"prefix":"**Security updates:**","labels":["security"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}} +# so that the component is shown associated with the issue +issue-line-labels=object-store +# skip non object_store issues +exclude-labels=development-process,invalid,arrow,parquet,arrow-flight +breaking_labels=api-change diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..83b4fa9 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,74 @@ + + +# Changelog + +## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) + +**Implemented enhancements:** + +- Relax Path Validation to Allow Any Percent-Encoded Sequence [\#2355](https://github.com/apache/arrow-rs/issues/2355) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support get\_multi\_ranges in ObjectStore [\#2293](https://github.com/apache/arrow-rs/issues/2293) +- object\_store: Create explicit test for symlinks [\#2206](https://github.com/apache/arrow-rs/issues/2206) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Make builder style configuration for object stores [\#2203](https://github.com/apache/arrow-rs/issues/2203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add example in the main documentation readme [\#2202](https://github.com/apache/arrow-rs/issues/2202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Switch object\_store to log crate from tokio-tracing [\#2255](https://github.com/apache/arrow-rs/issues/2255) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Azure/S3 Storage Fails to Copy Blob with URL-encoded Path 
[\#2353](https://github.com/apache/arrow-rs/issues/2353) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Accessing a file with a percent-encoded name on the filesystem with ObjectStore LocalFileSystem [\#2349](https://github.com/apache/arrow-rs/issues/2349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Test `local::tests::test_list_root` fails on main on macos [\#2174](https://github.com/apache/arrow-rs/issues/2174) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store unit test might be flaky [\#2141](https://github.com/apache/arrow-rs/issues/2141) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Update instructions on How to join the slack \#arrow-rust channel -- or maybe try to switch to discord?? [\#2192](https://github.com/apache/arrow-rs/issues/2192) +- Improve `object_store crate` documentation [\#2260](https://github.com/apache/arrow-rs/pull/2260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- Canonicalize filesystem paths in user-facing APIs \(\#2370\) [\#2371](https://github.com/apache/arrow-rs/pull/2371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store lint [\#2367](https://github.com/apache/arrow-rs/pull/2367) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relax path validation \(\#2355\) [\#2356](https://github.com/apache/arrow-rs/pull/2356) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix Copy from percent-encoded path \(\#2353\) [\#2354](https://github.com/apache/arrow-rs/pull/2354) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore::get\_ranges \(\#2293\) [\#2336](https://github.com/apache/arrow-rs/pull/2336) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove vestigal ` object_store/.circleci/` [\#2337](https://github.com/apache/arrow-rs/pull/2337) ([alamb](https://github.com/alamb)) +- Handle symlinks in LocalFileSystem \(\#2206\) [\#2269](https://github.com/apache/arrow-rs/pull/2269) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Retry GCP requests on server error [\#2243](https://github.com/apache/arrow-rs/pull/2243) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add LimitStore \(\#2175\) [\#2242](https://github.com/apache/arrow-rs/pull/2242) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Only trigger `arrow` CI on changes to arrow [\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) +- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) +- Add Builder style config objects for object\_store [\#2204](https://github.com/apache/arrow-rs/pull/2204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Ignore broken symlinks for LocalFileSystem 
object store [\#2195](https://github.com/apache/arrow-rs/pull/2195) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jccampagne](https://github.com/jccampagne)) +- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) +- Split most arrow specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) ([alamb](https://github.com/alamb)) +- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) +- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) +- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) +- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) +- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) +- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) +- Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index 7ccec86..ffb65aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.3.0" +version = "0.4.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/README.md b/dev/release/README.md new file mode 100644 index 0000000..79ea54f --- /dev/null +++ b/dev/release/README.md @@ -0,0 +1,20 @@ + + +See instructons in [`/dev/release/README.md`](../../../dev/release/README.md) diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh new file mode 100755 index 0000000..bbffde8 --- /dev/null +++ b/dev/release/create-tarball.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This script creates a signed tarball in +# dev/dist/apache-arrow-object-store-rs--.tar.gz and uploads it to +# the "dev" area of the dist.apache.arrow repository and prepares an +# email for sending to the dev@arrow.apache.org list for a formal +# vote. +# +# Note the tags are expected to be `object_sore_` +# +# See release/README.md for full release instructions +# +# Requirements: +# +# 1. gpg setup for signing and have uploaded your public +# signature to https://pgp.mit.edu/ +# +# 2. Logged into the apache svn server with the appropriate +# credentials +# +# +# Based in part on 02-source.sh from apache/arrow +# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "ex. $0 0.4.0 1" + exit +fi + +object_store_version=$1 +rc=$2 + +tag=object_store_${object_store_version} + +release=apache-arrow-object-store-rs-${object_store_version} +distdir=${SOURCE_TOP_DIR}/dev/dist/${release}-rc${rc} +tarname=${release}.tar.gz +tarball=${distdir}/${tarname} +url="https://dist.apache.org/repos/dist/dev/arrow/${release}-rc${rc}" + +echo "Attempting to create ${tarball} from tag ${tag}" + +release_hash=$(cd "${SOURCE_TOP_DIR}" && git rev-list --max-count=1 ${tag}) + +if [ -z "$release_hash" ]; then + echo "Cannot continue: unknown git tag: $tag" +fi + +echo "Draft email for dev@arrow.apache.org mailing list" +echo "" +echo "---------------------------------------------------------" +cat < containing the files in git at $release_hash +# the files in the tarball are prefixed with {object_store_version=} (e.g. 0.4.0) +mkdir -p ${distdir} +(cd "${SOURCE_TOP_DIR}" && git archive ${release_hash} --prefix ${release}/ | gzip > ${tarball}) + +echo "Running rat license checker on ${tarball}" +${SOURCE_DIR}/../../../dev/release/run-rat.sh ${tarball} + +echo "Signing tarball and creating checksums" +gpg --armor --output ${tarball}.asc --detach-sig ${tarball} +# create signing with relative path of tarball +# so that they can be verified with a command such as +# shasum --check apache-arrow-rs-4.1.0-rc2.tar.gz.sha512 +(cd ${distdir} && shasum -a 256 ${tarname}) > ${tarball}.sha256 +(cd ${distdir} && shasum -a 512 ${tarname}) > ${tarball}.sha512 + +echo "Uploading to apache dist/dev to ${url}" +svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow ${SOURCE_TOP_DIR}/dev/dist +svn add ${distdir} +svn ci -m "Apache Arrow Rust ${object_store_version=} ${rc}" ${distdir} diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh new file mode 100755 index 0000000..b1919bb --- /dev/null +++ b/dev/release/release-tarball.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This script copies a tarball from the "dev" area of the +# dist.apache.arrow repository to the "release" area +# +# This script should only be run after the release has been approved +# by the arrow PMC committee. +# +# See release/README.md for full release instructions +# +# Based in part on post-01-upload.sh from apache/arrow + + +set -e +set -u + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "ex. $0 0.4.0 1" + exit +fi + +version=$1 +rc=$2 + +tmp_dir=tmp-apache-arrow-dist + +echo "Recreate temporary directory: ${tmp_dir}" +rm -rf ${tmp_dir} +mkdir -p ${tmp_dir} + +echo "Clone dev dist repository" +svn \ + co \ + https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-obect-store-rs-${version}-rc${rc} \ + ${tmp_dir}/dev + +echo "Clone release dist repository" +svn co https://dist.apache.org/repos/dist/release/arrow ${tmp_dir}/release + +echo "Copy ${version}-rc${rc} to release working copy" +release_version=arrow-object-store-rs-${version} +mkdir -p ${tmp_dir}/release/${release_version} +cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/ +svn add ${tmp_dir}/release/${release_version} + +echo "Commit release" +svn ci -m "Apache Arrow Rust Object Store ${version}" ${tmp_dir}/release + +echo "Clean up" +rm -rf ${tmp_dir} + +echo "Success!" +echo "The release is available here:" +echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh new file mode 100755 index 0000000..77252c5 --- /dev/null +++ b/dev/release/update_change_log.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# invokes the changelog generator from +# https://github.com/github-changelog-generator/github-changelog-generator +# +# With the config located in +# arrow-rs/object_store/.github_changelog_generator +# +# Usage: +# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh + +set -e + +SINCE_TAG="object_store_0.3.0" +FUTURE_RELEASE="object_store_0.4.0" + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG.md" + +# remove license header so github-changelog-generator has a clean base to append +sed -i.bak '1,18d' "${OUTPUT_PATH}" + +pushd "${SOURCE_TOP_DIR}" +docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pwd)":/usr/local/src/your-app githubchangeloggenerator/github-changelog-generator \ + --user apache \ + --project arrow-rs \ + --cache-file=.githubchangeloggenerator.cache \ + --cache-log=.githubchangeloggenerator.cache.log \ + --http-cache \ + --max-issues=300 \ + --since-tag ${SINCE_TAG} \ + --future-release ${FUTURE_RELEASE} + +sed -i.bak "s/\\\n/\n\n/" "${OUTPUT_PATH}" + +# Put license header back on +echo ' +' | cat - "${OUTPUT_PATH}" > "${OUTPUT_PATH}".tmp +mv "${OUTPUT_PATH}".tmp "${OUTPUT_PATH}" diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh new file mode 100755 index 0000000..06a5d8b --- /dev/null +++ b/dev/release/verify-release-candidate.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +case $# in + 2) VERSION="$1" + RC_NUMBER="$2" + ;; + *) echo "Usage: $0 X.Y.Z RC_NUMBER" + exit 1 + ;; +esac + +set -e +set -x +set -o pipefail + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +ARROW_DIR="$(dirname $(dirname ${SOURCE_DIR}))" +ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' + +download_dist_file() { + curl \ + --silent \ + --show-error \ + --fail \ + --location \ + --remote-name $ARROW_DIST_URL/$1 +} + +download_rc_file() { + download_dist_file apache-arrow-object-store-rs-${VERSION}-rc${RC_NUMBER}/$1 +} + +import_gpg_keys() { + download_dist_file KEYS + gpg --import KEYS +} + +if type shasum >/dev/null 2>&1; then + sha256_verify="shasum -a 256 -c" + sha512_verify="shasum -a 512 -c" +else + sha256_verify="sha256sum -c" + sha512_verify="sha512sum -c" +fi + +fetch_archive() { + local dist_name=$1 + download_rc_file ${dist_name}.tar.gz + download_rc_file ${dist_name}.tar.gz.asc + download_rc_file ${dist_name}.tar.gz.sha256 + download_rc_file ${dist_name}.tar.gz.sha512 + gpg --verify ${dist_name}.tar.gz.asc ${dist_name}.tar.gz + ${sha256_verify} ${dist_name}.tar.gz.sha256 + ${sha512_verify} ${dist_name}.tar.gz.sha512 +} + +setup_tempdir() { + cleanup() { + if [ "${TEST_SUCCESS}" = "yes" ]; then + rm -fr "${ARROW_TMPDIR}" + else + echo "Failed to verify release candidate. See ${ARROW_TMPDIR} for details." + fi + } + + if [ -z "${ARROW_TMPDIR}" ]; then + # clean up automatically if ARROW_TMPDIR is not defined + ARROW_TMPDIR=$(mktemp -d -t "$1.XXXXX") + trap cleanup EXIT + else + # don't clean up automatically + mkdir -p "${ARROW_TMPDIR}" + fi +} + +test_source_distribution() { + # install rust toolchain in a similar fashion like test-miniconda + export RUSTUP_HOME=$PWD/test-rustup + export CARGO_HOME=$PWD/test-rustup + + curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path + + export PATH=$RUSTUP_HOME/bin:$PATH + source $RUSTUP_HOME/env + + # build and test rust + cargo build + cargo test --all + + # verify that the crate can be published to crates.io + cargo publish --dry-run +} + +TEST_SUCCESS=no + +setup_tempdir "arrow-${VERSION}" +echo "Working in sandbox ${ARROW_TMPDIR}" +cd ${ARROW_TMPDIR} + +dist_name="apache-arrow-object-store-rs-${VERSION}" +import_gpg_keys +fetch_archive ${dist_name} +tar xf ${dist_name}.tar.gz +pushd ${dist_name} +test_source_distribution +popd + +TEST_SUCCESS=yes +echo 'Release candidate looks good!' 
+exit 0 From 3ca6809e53c6a5dd7a10105b2bb68f595fbfd175 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 Aug 2022 12:33:02 +0100 Subject: [PATCH 020/397] Tweak object_store changelog (#2400) --- CHANGELOG.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83b4fa9..93faa67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,18 +30,14 @@ - object\_store: Create explicit test for symlinks [\#2206](https://github.com/apache/arrow-rs/issues/2206) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] - object\_store: Make builder style configuration for object stores [\#2203](https://github.com/apache/arrow-rs/issues/2203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] - object\_store: Add example in the main documentation readme [\#2202](https://github.com/apache/arrow-rs/issues/2202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Switch object\_store to log crate from tokio-tracing [\#2255](https://github.com/apache/arrow-rs/issues/2255) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** - Azure/S3 Storage Fails to Copy Blob with URL-encoded Path [\#2353](https://github.com/apache/arrow-rs/issues/2353) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] - Accessing a file with a percent-encoded name on the filesystem with ObjectStore LocalFileSystem [\#2349](https://github.com/apache/arrow-rs/issues/2349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Test `local::tests::test_list_root` fails on main on macos [\#2174](https://github.com/apache/arrow-rs/issues/2174) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store unit test might be flaky [\#2141](https://github.com/apache/arrow-rs/issues/2141) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Documentation updates:** -- Update instructions on How to join the slack \#arrow-rust channel -- or maybe try to switch to discord?? [\#2192](https://github.com/apache/arrow-rs/issues/2192) - Improve `object_store crate` documentation [\#2260](https://github.com/apache/arrow-rs/pull/2260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) **Merged pull requests:** From 30386bd8a4cc2d4cd30b2786d5fddc899c977e0a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 Aug 2022 13:17:10 +0100 Subject: [PATCH 021/397] Use correct tags when generating changelogs, fix release tarball typo (#2401) * Exclude tags when generating changelogs * Fix release-tarball typo --- dev/release/README.md | 2 +- dev/release/release-tarball.sh | 2 +- dev/release/update_change_log.sh | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dev/release/README.md b/dev/release/README.md index 79ea54f..89f6e57 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -17,4 +17,4 @@ under the License. 
--> -See instructons in [`/dev/release/README.md`](../../../dev/release/README.md) +See instructions in [`/dev/release/README.md`](../../../dev/release/README.md) diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh index b1919bb..75ff886 100755 --- a/dev/release/release-tarball.sh +++ b/dev/release/release-tarball.sh @@ -53,7 +53,7 @@ mkdir -p ${tmp_dir} echo "Clone dev dist repository" svn \ co \ - https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-obect-store-rs-${version}-rc${rc} \ + https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-object-store-rs-${version}-rc${rc} \ ${tmp_dir}/dev echo "Clone release dist repository" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 77252c5..673a180 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -48,6 +48,7 @@ docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pw --cache-log=.githubchangeloggenerator.cache.log \ --http-cache \ --max-issues=300 \ + --exclude-tags-regex "^\d+\.\d+\.\d+$" \ --since-tag ${SINCE_TAG} \ --future-release ${FUTURE_RELEASE} From 7d8fb76167f222c40a29a07a11e574f13c2d963d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Aug 2022 10:48:50 -0400 Subject: [PATCH 022/397] Add comments to changelog generator script (#2412) --- dev/release/update_change_log.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 673a180..ebd50df 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -40,6 +40,8 @@ OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG.md" # remove license header so github-changelog-generator has a clean base to append sed -i.bak '1,18d' "${OUTPUT_PATH}" +# use exclude-tags-regex to filter out tags used for arrow +# crates and only look at tags that begin with `object_store_` pushd "${SOURCE_TOP_DIR}" docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pwd)":/usr/local/src/your-app githubchangeloggenerator/github-changelog-generator \ --user apache \ From c3dd12b06589114b0e1c23eda4524b0c3c7998d7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 Aug 2022 20:22:39 +0100 Subject: [PATCH 023/397] Fix clippy lints (#2414) (#2415) --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 57e1371..f7adedb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -313,7 +313,7 @@ pub struct ListResult { } /// The metadata that describes an object. 
-#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ObjectMeta { /// The full path to the object pub location: Path, From 2243d46ee0cd48767fff90150482e2bccd476674 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Sat, 13 Aug 2022 23:04:36 +0200 Subject: [PATCH 024/397] feat: add token provider authorization to azure store (#2374) * feat: add token provider authorizatiojn to azure store * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * feat: adpot latest APIs from altest version * chore: clippy * fix: lifetime issue * chore: better errors and docs * chore: fmt whitespace * fix: firce first error in get method * chore: avoid unwrapping some options Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- Cargo.toml | 17 +-- src/azure.rs | 390 ++++++++++++++++++++++++++++----------------------- 2 files changed, 223 insertions(+), 184 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ffb65aa..bb37198 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,11 +22,7 @@ edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." -keywords = [ - "object", - "storage", - "cloud", -] +keywords = ["object", "storage", "cloud"] repository = "https://github.com/apache/arrow-rs" [package.metadata.docs.rs] @@ -35,9 +31,10 @@ all-features = true [dependencies] # In alphabetical order async-trait = "0.1.53" # Microsoft Azure Blob storage integration -azure_core = { version = "0.2", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage = { version = "0.2", optional = true, default-features = false, features = ["account"] } -azure_storage_blobs = { version = "0.2", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } +azure_core = { version = "0.4", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } +azure_identity = { version = "0.5", optional = true, default-features = false, features = ["enable_reqwest_rustls"]} +azure_storage = { version = "0.5", optional = true, default-features = false, features = ["enable_reqwest_rustls"]} +azure_storage_blobs = { version = "0.5", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } bytes = "1.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } # Google Cloud Storage integration @@ -70,7 +67,7 @@ url = "2.2" walkdir = "2" [features] -azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"] +azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest", "azure_identity"] azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] @@ -78,4 +75,4 @@ aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", " [dev-dependencies] # In alphabetical order dotenv = "0.15.0" tempfile = "3.1.0" -futures-test = "0.3" \ No newline at end of file +futures-test = "0.3" diff --git a/src/azure.rs b/src/azure.rs index 6a5f537..9987c03 100644 --- 
a/src/azure.rs +++ b/src/azure.rs @@ -33,22 +33,26 @@ use crate::{ GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; use async_trait::async_trait; -use azure_core::{prelude::*, HttpClient}; -use azure_storage::core::prelude::{AsStorageClient, StorageAccountClient}; -use azure_storage_blobs::blob::responses::ListBlobsResponse; +use azure_core::{ + error::{Error as AzureError, ErrorKind as AzureErrorKind}, + prelude::*, + StatusCode, +}; +use azure_identity::{ + AutoRefreshingTokenCredential, ClientSecretCredential, TokenCredentialOptions, +}; +use azure_storage::core::clients::StorageClient; use azure_storage_blobs::blob::Blob; -use azure_storage_blobs::{ - prelude::{AsBlobClient, AsContainerClient, ContainerClient}, - DeleteSnapshotsMethod, +use azure_storage_blobs::container::operations::ListBlobsResponse; +use azure_storage_blobs::prelude::{ + AsContainerClient, ContainerClient, DeleteSnapshotsMethod, }; use bytes::Bytes; -use futures::{ - future::BoxFuture, - stream::{self, BoxStream}, - StreamExt, TryStreamExt, -}; +use chrono::{TimeZone, Utc}; +use futures::{future::BoxFuture, stream::BoxStream, StreamExt, TryStreamExt}; use snafu::{ResultExt, Snafu}; use std::collections::BTreeSet; +use std::fmt::{Debug, Formatter}; use std::io; use std::{convert::TryInto, sync::Arc}; use tokio::io::AsyncWrite; @@ -66,7 +70,7 @@ enum Error { source, ))] UnableToDeleteData { - source: Box, + source: AzureError, container: String, path: String, }, @@ -79,7 +83,7 @@ enum Error { source, ))] UnableToGetData { - source: Box, + source: AzureError, container: String, path: String, }, @@ -92,7 +96,7 @@ enum Error { source, ))] UnableToHeadData { - source: Box, + source: AzureError, container: String, path: String, }, @@ -105,7 +109,7 @@ enum Error { source, ))] UnableToGetPieceOfData { - source: Box, + source: AzureError, container: String, path: String, }, @@ -118,7 +122,7 @@ enum Error { source, ))] UnableToPutData { - source: Box, + source: AzureError, container: String, path: String, }, @@ -130,7 +134,7 @@ enum Error { source, ))] UnableToListData { - source: Box, + source: AzureError, container: String, }, @@ -142,7 +146,7 @@ enum Error { source ))] UnableToCopyFile { - source: Box, + source: AzureError, container: String, from: String, to: String, @@ -160,12 +164,12 @@ enum Error { NotFound { path: String, - source: Box, + source: AzureError, }, AlreadyExists { path: String, - source: Box, + source: AzureError, }, #[cfg(not(feature = "azure_test"))] @@ -189,18 +193,24 @@ enum Error { #[snafu(display("Account must be specified"))] MissingAccount {}, - #[snafu(display("Access key must be specified"))] - MissingAccessKey {}, - #[snafu(display("Container name must be specified"))] MissingContainerName {}, + + #[snafu(display("At least one authorization option must be specified"))] + MissingCredentials {}, } impl From for super::Error { fn from(source: Error) -> Self { match source { - Error::NotFound { path, source } => Self::NotFound { path, source }, - Error::AlreadyExists { path, source } => Self::AlreadyExists { path, source }, + Error::NotFound { path, source } => Self::NotFound { + path, + source: Box::new(source), + }, + Error::AlreadyExists { path, source } => Self::AlreadyExists { + path, + source: Box::new(source), + }, _ => Self::Generic { store: "Azure Blob Storage", source: Box::new(source), @@ -227,25 +237,15 @@ impl std::fmt::Display for MicrosoftAzure { } } -#[allow(clippy::borrowed_box)] -fn check_err_not_found(err: &Box) -> bool { - if let 
Some(azure_core::HttpError::StatusCode { status, .. }) = - err.downcast_ref::() - { - return status.as_u16() == 404; - }; - false -} - #[async_trait] impl ObjectStore for MicrosoftAzure { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { let bytes = bytes::BytesMut::from(&*bytes); self.container_client - .as_blob_client(location.as_ref()) + .blob_client(location.as_ref()) .put_block_blob(bytes) - .execute() + .into_future() .await .context(UnableToPutDataSnafu { container: &self.container_name, @@ -277,29 +277,32 @@ impl ObjectStore for MicrosoftAzure { } async fn get(&self, location: &Path) -> Result { - let blob = self + let loc = location.clone(); + let mut stream = self .container_client - .as_blob_client(location.as_ref()) + .blob_client(location.as_ref()) .get() - .execute() - .await - .map_err(|err| { - if check_err_not_found(&err) { - return Error::NotFound { - source: err, - path: location.to_string(), - }; - }; - Error::UnableToGetData { - source: err, - container: self.container_name.clone(), - path: location.to_string(), - } - })?; + .into_stream() + .and_then(|chunk| chunk.data.collect()) + .map_err(move |err| match err.kind() { + AzureErrorKind::HttpResponse { + status: StatusCode::NotFound, + .. + } => crate::Error::NotFound { + source: Box::new(err), + path: loc.to_string(), + }, + _ => crate::Error::Generic { + source: Box::new(err), + store: "MicrosoftAzure", + }, + }) + .boxed(); - Ok(GetResult::Stream( - futures::stream::once(async move { Ok(blob.data) }).boxed(), - )) + let first = stream.next().await.transpose()?.unwrap_or_default(); + Ok(GetResult::Stream(Box::pin( + futures::stream::once(async { Ok(first) }).chain(stream), + ))) } async fn get_range( @@ -307,49 +310,62 @@ impl ObjectStore for MicrosoftAzure { location: &Path, range: std::ops::Range, ) -> Result { - let blob = self + let map_azure_err = |err: AzureError| match err.kind() { + AzureErrorKind::HttpResponse { + status: StatusCode::NotFound, + .. + } => Error::NotFound { + source: err, + path: location.to_string(), + }, + _ => Error::UnableToGetPieceOfData { + source: err, + container: self.container_name.clone(), + path: location.to_string(), + }, + }; + + let mut stream = self .container_client - .as_blob_client(location.as_ref()) + .blob_client(location.as_ref()) .get() .range(range) - .execute() - .await - .map_err(|err| { - if check_err_not_found(&err) { - return Error::NotFound { - source: err, - path: location.to_string(), - }; - }; - Error::UnableToGetPieceOfData { - source: err, - container: self.container_name.clone(), - path: location.to_string(), - } - })?; + .into_stream(); + + let mut chunk: Vec = vec![]; + while let Some(value) = stream.next().await { + let value = value + .map_err(map_azure_err)? + .data + .collect() + .await + .map_err(map_azure_err)?; + chunk.extend(&value); + } - Ok(blob.data) + Ok(chunk.into()) } async fn head(&self, location: &Path) -> Result { let res = self .container_client - .as_blob_client(location.as_ref()) + .blob_client(location.as_ref()) .get_properties() - .execute() + .into_future() .await - .map_err(|err| { - if check_err_not_found(&err) { - return Error::NotFound { - source: err, - path: location.to_string(), - }; - }; - Error::UnableToHeadData { + .map_err(|err| match err.kind() { + AzureErrorKind::HttpResponse { + status: StatusCode::NotFound, + .. 
+ } => Error::NotFound { + source: err, + path: location.to_string(), + }, + _ => Error::UnableToHeadData { source: err, container: self.container_name.clone(), path: location.to_string(), - } + }, })?; convert_object_meta(res.blob)?.ok_or_else(|| super::Error::NotFound { @@ -360,10 +376,10 @@ impl ObjectStore for MicrosoftAzure { async fn delete(&self, location: &Path) -> Result<()> { self.container_client - .as_blob_client(location.as_ref()) + .blob_client(location.as_ref()) .delete() .delete_snapshots_method(DeleteSnapshotsMethod::Include) - .execute() + .into_future() .await .context(UnableToDeleteDataSnafu { container: &self.container_name, @@ -426,9 +442,9 @@ impl ObjectStore for MicrosoftAzure { async fn copy(&self, from: &Path, to: &Path) -> Result<()> { let from_url = self.get_copy_from_url(from)?; self.container_client - .as_blob_client(to.as_ref()) - .copy(&from_url) - .execute() + .blob_client(to.as_ref()) + .copy(from_url) + .into_future() .await .context(UnableToCopyFileSnafu { container: &self.container_name, @@ -441,20 +457,20 @@ impl ObjectStore for MicrosoftAzure { async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { let from_url = self.get_copy_from_url(from)?; self.container_client - .as_blob_client(to.as_ref()) - .copy(&from_url) - .if_match_condition(IfMatchCondition::NotMatch("*".to_string())) - .execute() + .blob_client(to.as_ref()) + .copy(from_url) + .if_match(IfMatchCondition::NotMatch("*".to_string())) + .into_future() .await .map_err(|err| { - if let Some(azure_core::HttpError::StatusCode { status, .. }) = - err.downcast_ref::() + if let AzureErrorKind::HttpResponse { + status: StatusCode::Conflict, + .. + } = err.kind() { - if status.as_u16() == 409 { - return Error::AlreadyExists { - source: err, - path: to.to_string(), - }; + return Error::AlreadyExists { + source: err, + path: to.to_string(), }; }; Error::UnableToCopyFile { @@ -486,60 +502,33 @@ impl MicrosoftAzure { prefix: Option<&Path>, delimiter: bool, ) -> Result>> { - enum ListState { - Start, - HasMore(String), - Done, + let mut stream = self.container_client.list_blobs(); + if let Some(prefix_val) = format_prefix(prefix) { + stream = stream.prefix(prefix_val); + } + if delimiter { + stream = stream.delimiter(Delimiter::new(DELIMITER)); } - let prefix_raw = format_prefix(prefix); - - Ok(stream::unfold(ListState::Start, move |state| { - let mut request = self.container_client.list_blobs(); - - if let Some(p) = prefix_raw.as_deref() { - request = request.prefix(p); - } - - if delimiter { - request = request.delimiter(Delimiter::new(DELIMITER)); - } - - async move { - match state { - ListState::HasMore(ref marker) => { - request = request.next_marker(marker as &str); - } - ListState::Done => { - return None; - } - ListState::Start => {} - } - - let resp = match request.execute().await.context(UnableToListDataSnafu { - container: &self.container_name, - }) { - Ok(resp) => resp, - Err(err) => return Some((Err(crate::Error::from(err)), state)), - }; - - let next_state = if let Some(marker) = &resp.next_marker { - ListState::HasMore(marker.as_str().to_string()) - } else { - ListState::Done - }; + let stream = stream + .into_stream() + .map(|resp| match resp { + Ok(list_blobs) => Ok(list_blobs), + Err(err) => Err(crate::Error::from(Error::UnableToListData { + source: err, + container: self.container_name.clone(), + })), + }) + .boxed(); - Some((Ok(resp), next_state)) - } - }) - .boxed()) + Ok(stream) } } /// Returns `None` if is a directory fn convert_object_meta(blob: Blob) -> Result> 
{ let location = Path::parse(blob.name)?; - let last_modified = blob.properties.last_modified; + let last_modified = Utc.timestamp(blob.properties.last_modified.unix_timestamp(), 0); let size = blob .properties .content_length @@ -580,7 +569,7 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { Ok(url) } -/// Configure a connection to Mirosoft Azure Blob Storage bucket using +/// Configure a connection to Microsoft Azure Blob Storage container using /// the specified credentials. /// /// # Example @@ -595,14 +584,28 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { /// .with_container_name(BUCKET_NAME) /// .build(); /// ``` -#[derive(Debug, Default)] +#[derive(Default)] pub struct MicrosoftAzureBuilder { account: Option, access_key: Option, container_name: Option, + bearer_token: Option, + client_id: Option, + client_secret: Option, + tenant_id: Option, use_emulator: bool, } +impl Debug for MicrosoftAzureBuilder { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "MicrosoftAzureBuilder {{ account: {:?}, container_name: {:?} }}", + self.account, self.container_name + ) + } +} + impl MicrosoftAzureBuilder { /// Create a new [`MicrosoftAzureBuilder`] with default values. pub fn new() -> Self { @@ -615,18 +618,46 @@ impl MicrosoftAzureBuilder { self } - /// Set the Azure Access Key (required) + /// Set the Azure Access Key (required - one of access key, bearer token, or client credentials) pub fn with_access_key(mut self, access_key: impl Into) -> Self { self.access_key = Some(access_key.into()); self } + /// Set a static bearer token to be used for authorizing requests + /// (required - one of access key, bearer token, or client credentials) + pub fn with_bearer_token(mut self, bearer_token: impl Into) -> Self { + self.bearer_token = Some(bearer_token.into()); + self + } + /// Set the Azure Container Name (required) pub fn with_container_name(mut self, container_name: impl Into) -> Self { self.container_name = Some(container_name.into()); self } + /// Set a client id used for client secret authorization + /// (required - one of access key, bearer token, or client credentials) + pub fn with_client_id(mut self, client_id: impl Into) -> Self { + self.client_id = Some(client_id.into()); + self + } + + /// Set a client secret used for client secret authorization + /// (required - one of access key, bearer token, or client credentials) + pub fn with_client_secret(mut self, client_secret: impl Into) -> Self { + self.client_secret = Some(client_secret.into()); + self + } + + /// Set the tenant id of the Azure AD tenant + /// (required - one of access key, bearer token, or client credentials) + pub fn with_tenant_id(mut self, tenant_id: impl Into) -> Self { + self.tenant_id = Some(tenant_id.into()); + self + } + /// Set if the Azure emulator should be used (defaults to false) pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { self.use_emulator = use_emulator; @@ -640,20 +671,20 @@ impl MicrosoftAzureBuilder { account, access_key, container_name, + bearer_token, + client_id, + client_secret, + tenant_id, use_emulator, } = self; let account = account.ok_or(Error::MissingAccount {})?; - let access_key = access_key.ok_or(Error::MissingAccessKey {})?; let container_name = container_name.ok_or(Error::MissingContainerName {})?; - let http_client: Arc = Arc::new(reqwest::Client::new()); - - let (is_emulator, storage_account_client) = if use_emulator { + let (is_emulator, storage_client) = if use_emulator { check_if_emulator_works()?; // 
Allow overriding defaults. Values taken from // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 - let http_client = azure_core::new_http_client(); let blob_storage_url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; let queue_storage_url = @@ -663,8 +694,7 @@ impl MicrosoftAzureBuilder { let filesystem_url = url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10004")?; - let storage_client = StorageAccountClient::new_emulator( - http_client, + let storage_client = StorageClient::new_emulator( &blob_storage_url, &table_storage_url, &queue_storage_url, @@ -673,25 +703,37 @@ impl MicrosoftAzureBuilder { (true, storage_client) } else { - ( - false, - StorageAccountClient::new_access_key( - Arc::clone(&http_client), - &account, - &access_key, - ), - ) + let client = if let Some(bearer_token) = bearer_token { + Ok(StorageClient::new_bearer_token(&account, bearer_token)) + } else if let Some(access_key) = access_key { + Ok(StorageClient::new_access_key(&account, access_key)) + } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = + (tenant_id, client_id, client_secret) + { + let credential = Arc::new(AutoRefreshingTokenCredential::new(Arc::new( + ClientSecretCredential::new( + tenant_id, + client_id, + client_secret, + TokenCredentialOptions::default(), + ), + ))); + Ok(StorageClient::new_token_credential(&account, credential)) + } else { + Err(Error::MissingCredentials {}) + }?; + + (false, client) }; - let storage_client = storage_account_client.as_storage_client(); - let blob_base_url = storage_account_client + let blob_base_url = storage_client .blob_storage_url() .as_ref() // make url ending consistent between the emulator and remote storage account .trim_end_matches('/') .to_string(); - let container_client = storage_client.as_container_client(&container_name); + let container_client = Arc::new(storage_client.container_client(&container_name)); Ok(MicrosoftAzure { container_client, @@ -735,9 +777,9 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { Box::pin(async move { client - .as_blob_client(location.as_ref()) + .blob_client(location.as_ref()) .put_block(block_id.clone(), buf) - .execute() + .into_future() .await .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; @@ -761,7 +803,7 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { .map(|(part_number, maybe_part)| match maybe_part { Some(part) => { Ok(azure_storage_blobs::blob::BlobBlockType::Uncommitted( - azure_storage_blobs::BlockId::new(part.content_id), + azure_storage_blobs::prelude::BlockId::new(part.content_id), )) } None => Err(io::Error::new( @@ -779,9 +821,9 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { }; client - .as_blob_client(location.as_ref()) - .put_block_list(&block_list) - .execute() + .blob_client(location.as_ref()) + .put_block_list(block_list) + .into_future() .await .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; From 788d00de14dc81308a5fde689c3e07eecfea77f9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 15 Aug 2022 12:30:30 +0100 Subject: [PATCH 025/397] Replace rusoto with custom implementation for AWS (#2176) (#2352) * Replace rusoto (#2176) * Add integration test for metadata endpoint * Fix WebIdentity * Fix doc * Fix handling of multipart errors * Use separate client for credentials * Include port in Host header canonical request * Fix doc link * Review feedback --- Cargo.toml | 11 +- 
src/aws.rs | 1343 -------------------------------------- src/aws/client.rs | 483 ++++++++++++++ src/aws/credential.rs | 590 +++++++++++++++++ src/aws/mod.rs | 646 ++++++++++++++++++ src/azure.rs | 85 +-- src/client/mod.rs | 2 + src/client/pagination.rs | 70 ++ src/client/token.rs | 10 +- src/gcp.rs | 219 +++---- src/lib.rs | 14 +- src/multipart.rs | 59 +- 12 files changed, 1972 insertions(+), 1560 deletions(-) delete mode 100644 src/aws.rs create mode 100644 src/aws/client.rs create mode 100644 src/aws/credential.rs create mode 100644 src/aws/mod.rs create mode 100644 src/client/pagination.rs diff --git a/Cargo.toml b/Cargo.toml index bb37198..8c713d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,17 +46,8 @@ rustls-pemfile = { version = "1.0", default-features = false, optional = true } ring = { version = "0.16", default-features = false, features = ["std"], optional = true } base64 = { version = "0.13", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -# for rusoto -hyper = { version = "0.14", optional = true, default-features = false } -# for rusoto -hyper-rustls = { version = "0.23.0", optional = true, default-features = false, features = ["webpki-tokio", "http1", "http2", "tls12"] } itertools = "0.10.1" percent-encoding = "2.1" -# rusoto crates are for Amazon S3 integration -rusoto_core = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } -rusoto_credential = { version = "0.48.0", optional = true, default-features = false } -rusoto_s3 = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } -rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } snafu = "0.7" tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } tracing = { version = "0.1" } @@ -70,7 +61,7 @@ walkdir = "2" azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest", "azure_identity"] azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] -aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] +aws = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] [dev-dependencies] # In alphabetical order dotenv = "0.15.0" diff --git a/src/aws.rs b/src/aws.rs deleted file mode 100644 index bcb294c..0000000 --- a/src/aws.rs +++ /dev/null @@ -1,1343 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -//! An object store implementation for S3 -//! -//! ## Multi-part uploads -//! -//! Multi-part uploads can be initiated with the [ObjectStore::put_multipart] method. -//! Data passed to the writer is automatically buffered to meet the minimum size -//! requirements for a part. Multiple parts are uploaded concurrently. -//! -//! If the writer fails for any reason, you may have parts uploaded to AWS but not -//! used that you may be charged for. Use the [ObjectStore::abort_multipart] method -//! to abort the upload and drop those unneeded parts. In addition, you may wish to -//! consider implementing [automatic cleanup] of unused parts that are older than one -//! week. -//! -//! [automatic cleanup]: https://aws.amazon.com/blogs/aws/s3-lifecycle-management-update-support-for-multipart-uploads-and-delete-markers/ -use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; -use crate::util::format_http_range; -use crate::MultipartId; -use crate::{ - collect_bytes, - path::{Path, DELIMITER}, - util::format_prefix, - GetResult, ListResult, ObjectMeta, ObjectStore, Result, -}; -use async_trait::async_trait; -use bytes::Bytes; -use chrono::{DateTime, Utc}; -use futures::future::BoxFuture; -use futures::{ - stream::{self, BoxStream}, - Future, Stream, StreamExt, TryStreamExt, -}; -use hyper::client::Builder as HyperBuilder; -use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC}; -use rusoto_core::ByteStream; -use rusoto_credential::{InstanceMetadataProvider, StaticProvider}; -use rusoto_s3::S3; -use rusoto_sts::WebIdentityProvider; -use snafu::{OptionExt, ResultExt, Snafu}; -use std::io; -use std::ops::Range; -use std::{ - convert::TryFrom, fmt, num::NonZeroUsize, ops::Deref, sync::Arc, time::Duration, -}; -use tokio::io::AsyncWrite; -use tokio::sync::{OwnedSemaphorePermit, Semaphore}; -use tracing::{debug, warn}; - -// Do not URI-encode any of the unreserved characters that RFC 3986 defines: -// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). -const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC - .remove(b'-') - .remove(b'.') - .remove(b'_') - .remove(b'~'); - -/// This struct is used to maintain the URI path encoding -const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); - -/// The maximum number of times a request will be retried in the case of an AWS server error -pub const MAX_NUM_RETRIES: u32 = 3; - -/// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] -#[allow(missing_docs)] -enum Error { - #[snafu(display( - "Expected streamed data to have length {}, got {}", - expected, - actual - ))] - DataDoesNotMatchLength { expected: usize, actual: usize }, - - #[snafu(display( - "Did not receive any data. Bucket: {}, Location: {}", - bucket, - path - ))] - NoData { bucket: String, path: String }, - - #[snafu(display( - "Unable to DELETE data. Bucket: {}, Location: {}, Error: {} ({:?})", - bucket, - path, - source, - source, - ))] - UnableToDeleteData { - source: rusoto_core::RusotoError, - bucket: String, - path: String, - }, - - #[snafu(display( - "Unable to GET data. Bucket: {}, Location: {}, Error: {} ({:?})", - bucket, - path, - source, - source, - ))] - UnableToGetData { - source: rusoto_core::RusotoError, - bucket: String, - path: String, - }, - - #[snafu(display( - "Unable to HEAD data. 
Bucket: {}, Location: {}, Error: {} ({:?})", - bucket, - path, - source, - source, - ))] - UnableToHeadData { - source: rusoto_core::RusotoError, - bucket: String, - path: String, - }, - - #[snafu(display( - "Unable to GET part of the data. Bucket: {}, Location: {}, Error: {} ({:?})", - bucket, - path, - source, - source, - ))] - UnableToGetPieceOfData { - source: std::io::Error, - bucket: String, - path: String, - }, - - #[snafu(display( - "Unable to PUT data. Bucket: {}, Location: {}, Error: {} ({:?})", - bucket, - path, - source, - source, - ))] - UnableToPutData { - source: rusoto_core::RusotoError, - bucket: String, - path: String, - }, - - #[snafu(display( - "Unable to upload data. Bucket: {}, Location: {}, Error: {} ({:?})", - bucket, - path, - source, - source, - ))] - UnableToUploadData { - source: rusoto_core::RusotoError, - bucket: String, - path: String, - }, - - #[snafu(display( - "Unable to cleanup multipart data. Bucket: {}, Location: {}, Error: {} ({:?})", - bucket, - path, - source, - source, - ))] - UnableToCleanupMultipartData { - source: rusoto_core::RusotoError, - bucket: String, - path: String, - }, - - #[snafu(display( - "Unable to list data. Bucket: {}, Error: {} ({:?})", - bucket, - source, - source, - ))] - UnableToListData { - source: rusoto_core::RusotoError, - bucket: String, - }, - - #[snafu(display( - "Unable to copy object. Bucket: {}, From: {}, To: {}, Error: {}", - bucket, - from, - to, - source, - ))] - UnableToCopyObject { - source: rusoto_core::RusotoError, - bucket: String, - from: String, - to: String, - }, - - #[snafu(display( - "Unable to parse last modified date. Bucket: {}, Error: {} ({:?})", - bucket, - source, - source, - ))] - UnableToParseLastModified { - source: chrono::ParseError, - bucket: String, - }, - - #[snafu(display( - "Unable to buffer data into temporary file, Error: {} ({:?})", - source, - source, - ))] - UnableToBufferStream { source: std::io::Error }, - - #[snafu(display( - "Could not parse `{}` as an AWS region. Regions should look like `us-east-2`. {} ({:?})", - region, - source, - source, - ))] - InvalidRegion { - region: String, - source: rusoto_core::region::ParseRegionError, - }, - - #[snafu(display( - "Region must be specified for AWS S3. Regions should look like `us-east-2`" - ))] - MissingRegion {}, - - #[snafu(display("Missing bucket name"))] - MissingBucketName {}, - - #[snafu(display("Missing aws-access-key"))] - MissingAccessKey, - - #[snafu(display("Missing aws-secret-access-key"))] - MissingSecretAccessKey, - - NotFound { - path: String, - source: Box, - }, -} - -impl From for super::Error { - fn from(source: Error) -> Self { - match source { - Error::NotFound { path, source } => Self::NotFound { path, source }, - _ => Self::Generic { - store: "S3", - source: Box::new(source), - }, - } - } -} - -/// Interface for [Amazon S3](https://aws.amazon.com/s3/). -pub struct AmazonS3 { - /// S3 client w/o any connection limit. - /// - /// You should normally use [`Self::client`] instead. - client_unrestricted: rusoto_s3::S3Client, - - /// Semaphore that limits the usage of [`client_unrestricted`](Self::client_unrestricted). - connection_semaphore: Arc, - - /// Bucket name used by this object store client. 
- bucket_name: String, -} - -impl fmt::Debug for AmazonS3 { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("AmazonS3") - .field("client", &"rusoto_s3::S3Client") - .field("bucket_name", &self.bucket_name) - .finish() - } -} - -impl fmt::Display for AmazonS3 { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "AmazonS3({})", self.bucket_name) - } -} - -#[async_trait] -impl ObjectStore for AmazonS3 { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - let bucket_name = self.bucket_name.clone(); - let request_factory = move || { - let bytes = bytes.clone(); - - let length = bytes.len(); - let stream_data = Ok(bytes); - let stream = futures::stream::once(async move { stream_data }); - let byte_stream = ByteStream::new_with_size(stream, length); - - rusoto_s3::PutObjectRequest { - bucket: bucket_name.clone(), - key: location.to_string(), - body: Some(byte_stream), - ..Default::default() - } - }; - - let s3 = self.client().await; - - s3_request(move || { - let (s3, request_factory) = (s3.clone(), request_factory.clone()); - - async move { s3.put_object(request_factory()).await } - }) - .await - .context(UnableToPutDataSnafu { - bucket: &self.bucket_name, - path: location.as_ref(), - })?; - - Ok(()) - } - - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - let bucket_name = self.bucket_name.clone(); - - let request_factory = move || rusoto_s3::CreateMultipartUploadRequest { - bucket: bucket_name.clone(), - key: location.to_string(), - ..Default::default() - }; - - let s3 = self.client().await; - - let data = s3_request(move || { - let (s3, request_factory) = (s3.clone(), request_factory.clone()); - - async move { s3.create_multipart_upload(request_factory()).await } - }) - .await - .context(UnableToUploadDataSnafu { - bucket: &self.bucket_name, - path: location.as_ref(), - })?; - - let upload_id = data.upload_id.unwrap(); - - let inner = S3MultiPartUpload { - upload_id: upload_id.clone(), - bucket: self.bucket_name.clone(), - key: location.to_string(), - client_unrestricted: self.client_unrestricted.clone(), - connection_semaphore: Arc::clone(&self.connection_semaphore), - }; - - Ok((upload_id, Box::new(CloudMultiPartUpload::new(inner, 8)))) - } - - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { - let request_factory = move || rusoto_s3::AbortMultipartUploadRequest { - bucket: self.bucket_name.clone(), - key: location.to_string(), - upload_id: multipart_id.to_string(), - ..Default::default() - }; - - let s3 = self.client().await; - s3_request(move || { - let (s3, request_factory) = (s3.clone(), request_factory); - - async move { s3.abort_multipart_upload(request_factory()).await } - }) - .await - .context(UnableToCleanupMultipartDataSnafu { - bucket: &self.bucket_name, - path: location.as_ref(), - })?; - - Ok(()) - } - - async fn get(&self, location: &Path) -> Result { - Ok(GetResult::Stream( - self.get_object(location, None).await?.boxed(), - )) - } - - async fn get_range(&self, location: &Path, range: Range) -> Result { - let size_hint = range.end - range.start; - let stream = self.get_object(location, Some(range)).await?; - collect_bytes(stream, Some(size_hint)).await - } - - async fn head(&self, location: &Path) -> Result { - let key = location.to_string(); - let head_request = rusoto_s3::HeadObjectRequest { - bucket: self.bucket_name.clone(), - key: key.clone(), - ..Default::default() - }; - let s = self - .client() - .await 
- .head_object(head_request) - .await - .map_err(|e| match e { - rusoto_core::RusotoError::Service( - rusoto_s3::HeadObjectError::NoSuchKey(_), - ) => Error::NotFound { - path: key.clone(), - source: e.into(), - }, - rusoto_core::RusotoError::Unknown(h) if h.status.as_u16() == 404 => { - Error::NotFound { - path: key.clone(), - source: "resource not found".into(), - } - } - _ => Error::UnableToHeadData { - bucket: self.bucket_name.to_owned(), - path: key.clone(), - source: e, - }, - })?; - - // Note: GetObject and HeadObject return a different date format from ListObjects - // - // S3 List returns timestamps in the form - // 2013-09-17T18:07:53.000Z - // S3 GetObject returns timestamps in the form - // Last-Modified: Sun, 1 Jan 2006 12:00:00 GMT - let last_modified = match s.last_modified { - Some(lm) => DateTime::parse_from_rfc2822(&lm) - .context(UnableToParseLastModifiedSnafu { - bucket: &self.bucket_name, - })? - .with_timezone(&Utc), - None => Utc::now(), - }; - - Ok(ObjectMeta { - last_modified, - location: location.clone(), - size: usize::try_from(s.content_length.unwrap_or(0)) - .expect("unsupported size on this platform"), - }) - } - - async fn delete(&self, location: &Path) -> Result<()> { - let bucket_name = self.bucket_name.clone(); - - let request_factory = move || rusoto_s3::DeleteObjectRequest { - bucket: bucket_name.clone(), - key: location.to_string(), - ..Default::default() - }; - - let s3 = self.client().await; - - s3_request(move || { - let (s3, request_factory) = (s3.clone(), request_factory.clone()); - - async move { s3.delete_object(request_factory()).await } - }) - .await - .context(UnableToDeleteDataSnafu { - bucket: &self.bucket_name, - path: location.as_ref(), - })?; - - Ok(()) - } - - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - Ok(self - .list_objects_v2(prefix, None) - .await? - .map_ok(move |list_objects_v2_result| { - let contents = list_objects_v2_result.contents.unwrap_or_default(); - let iter = contents - .into_iter() - .map(|object| convert_object_meta(object, &self.bucket_name)); - - futures::stream::iter(iter) - }) - .try_flatten() - .boxed()) - } - - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - Ok(self - .list_objects_v2(prefix, Some(DELIMITER.to_string())) - .await? - .try_fold( - ListResult { - common_prefixes: vec![], - objects: vec![], - }, - |acc, list_objects_v2_result| async move { - let mut res = acc; - let contents = list_objects_v2_result.contents.unwrap_or_default(); - let mut objects = contents - .into_iter() - .map(|object| convert_object_meta(object, &self.bucket_name)) - .collect::>>()?; - - res.objects.append(&mut objects); - - let prefixes = - list_objects_v2_result.common_prefixes.unwrap_or_default(); - res.common_prefixes.reserve(prefixes.len()); - - for p in prefixes { - let prefix = - p.prefix.expect("can't have a prefix without a value"); - res.common_prefixes.push(Path::parse(prefix)?); - } - - Ok(res) - }, - ) - .await?) 
- } - - async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let from = from.as_ref(); - let to = to.as_ref(); - let bucket_name = self.bucket_name.clone(); - - let copy_source = format!( - "{}/{}", - &bucket_name, - percent_encode(from.as_ref(), &STRICT_PATH_ENCODE_SET) - ); - - let request_factory = move || rusoto_s3::CopyObjectRequest { - bucket: bucket_name.clone(), - copy_source, - key: to.to_string(), - ..Default::default() - }; - - let s3 = self.client().await; - - s3_request(move || { - let (s3, request_factory) = (s3.clone(), request_factory.clone()); - - async move { s3.copy_object(request_factory()).await } - }) - .await - .context(UnableToCopyObjectSnafu { - bucket: &self.bucket_name, - from, - to, - })?; - - Ok(()) - } - - async fn copy_if_not_exists(&self, _source: &Path, _dest: &Path) -> Result<()> { - // Will need dynamodb_lock - Err(crate::Error::NotImplemented) - } -} - -fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result { - let key = object.key.expect("object doesn't exist without a key"); - let location = Path::parse(key)?; - let last_modified = match object.last_modified { - Some(lm) => DateTime::parse_from_rfc3339(&lm) - .context(UnableToParseLastModifiedSnafu { bucket })? - .with_timezone(&Utc), - None => Utc::now(), - }; - let size = usize::try_from(object.size.unwrap_or(0)) - .expect("unsupported size on this platform"); - - Ok(ObjectMeta { - location, - last_modified, - size, - }) -} - -/// Configure a connection to Amazon S3 using the specified credentials in -/// the specified Amazon region and bucket. -/// -/// # Example -/// ``` -/// # let REGION = "foo"; -/// # let BUCKET_NAME = "foo"; -/// # let ACCESS_KEY_ID = "foo"; -/// # let SECRET_KEY = "foo"; -/// # use object_store::aws::AmazonS3Builder; -/// let s3 = AmazonS3Builder::new() -/// .with_region(REGION) -/// .with_bucket_name(BUCKET_NAME) -/// .with_access_key_id(ACCESS_KEY_ID) -/// .with_secret_access_key(SECRET_KEY) -/// .build(); -/// ``` -#[derive(Debug)] -pub struct AmazonS3Builder { - access_key_id: Option, - secret_access_key: Option, - region: Option, - bucket_name: Option, - endpoint: Option, - token: Option, - max_connections: NonZeroUsize, - allow_http: bool, -} - -impl Default for AmazonS3Builder { - fn default() -> Self { - Self { - access_key_id: None, - secret_access_key: None, - region: None, - bucket_name: None, - endpoint: None, - token: None, - max_connections: NonZeroUsize::new(16).unwrap(), - allow_http: false, - } - } -} - -impl AmazonS3Builder { - /// Create a new [`AmazonS3Builder`] with default values. - pub fn new() -> Self { - Default::default() - } - - /// Set the AWS Access Key (required) - pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { - self.access_key_id = Some(access_key_id.into()); - self - } - - /// Set the AWS Secret Access Key (required) - pub fn with_secret_access_key( - mut self, - secret_access_key: impl Into, - ) -> Self { - self.secret_access_key = Some(secret_access_key.into()); - self - } - - /// Set the region (e.g. `us-east-1`) (required) - pub fn with_region(mut self, region: impl Into) -> Self { - self.region = Some(region.into()); - self - } - - /// Set the bucket_name (required) - pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { - self.bucket_name = Some(bucket_name.into()); - self - } - - /// Sets the endpoint for communicating with AWS S3. Default value - /// is based on region. 
- /// - /// For example, this might be set to `"http://localhost:4566:` - /// for testing against a localstack instance. - pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { - self.endpoint = Some(endpoint.into()); - self - } - - /// Set the token to use for requests (passed to underlying provider) - pub fn with_token(mut self, token: impl Into) -> Self { - self.token = Some(token.into()); - self - } - - /// Sets the maximum number of concurrent outstanding - /// connectons. Default is `16`. - #[deprecated(note = "use LimitStore instead")] - pub fn with_max_connections(mut self, max_connections: NonZeroUsize) -> Self { - self.max_connections = max_connections; - self - } - - /// Sets what protocol is allowed. If `allow_http` is : - /// * false (default): Only HTTPS are allowed - /// * true: HTTP and HTTPS are allowed - pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.allow_http = allow_http; - self - } - - /// Create a [`AmazonS3`] instance from the provided values, - /// consuming `self`. - pub fn build(self) -> Result { - let Self { - access_key_id, - secret_access_key, - region, - bucket_name, - endpoint, - token, - max_connections, - allow_http, - } = self; - - let region = region.ok_or(Error::MissingRegion {})?; - let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; - - let region: rusoto_core::Region = match endpoint { - None => region.parse().context(InvalidRegionSnafu { region })?, - Some(endpoint) => rusoto_core::Region::Custom { - name: region, - endpoint, - }, - }; - - let mut builder = HyperBuilder::default(); - builder.pool_max_idle_per_host(max_connections.get()); - - let connector = if allow_http { - hyper_rustls::HttpsConnectorBuilder::new() - .with_webpki_roots() - .https_or_http() - .enable_http1() - .enable_http2() - .build() - } else { - hyper_rustls::HttpsConnectorBuilder::new() - .with_webpki_roots() - .https_only() - .enable_http1() - .enable_http2() - .build() - }; - - let http_client = - rusoto_core::request::HttpClient::from_builder(builder, connector); - - let client = match (access_key_id, secret_access_key, token) { - (Some(access_key_id), Some(secret_access_key), Some(token)) => { - let credentials_provider = StaticProvider::new( - access_key_id, - secret_access_key, - Some(token), - None, - ); - rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) - } - (Some(access_key_id), Some(secret_access_key), None) => { - let credentials_provider = - StaticProvider::new_minimal(access_key_id, secret_access_key); - rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) - } - (None, Some(_), _) => return Err(Error::MissingAccessKey.into()), - (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), - _ if std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE").is_some() => { - rusoto_s3::S3Client::new_with( - http_client, - WebIdentityProvider::from_k8s_env(), - region, - ) - } - _ => rusoto_s3::S3Client::new_with( - http_client, - InstanceMetadataProvider::new(), - region, - ), - }; - - Ok(AmazonS3 { - client_unrestricted: client, - connection_semaphore: Arc::new(Semaphore::new(max_connections.get())), - bucket_name, - }) - } -} - -/// S3 client bundled w/ a semaphore permit. -#[derive(Clone)] -struct SemaphoreClient { - /// Permit for this specific use of the client. - /// - /// Note that this field is never read and therefore considered "dead code" by rustc. 
- #[allow(dead_code)] - permit: Arc, - - inner: rusoto_s3::S3Client, -} - -impl Deref for SemaphoreClient { - type Target = rusoto_s3::S3Client; - - fn deref(&self) -> &Self::Target { - &self.inner - } -} - -impl AmazonS3 { - /// Get a client according to the current connection limit. - async fn client(&self) -> SemaphoreClient { - let permit = Arc::clone(&self.connection_semaphore) - .acquire_owned() - .await - .expect("semaphore shouldn't be closed yet"); - SemaphoreClient { - permit: Arc::new(permit), - inner: self.client_unrestricted.clone(), - } - } - - async fn get_object( - &self, - location: &Path, - range: Option>, - ) -> Result>> { - let key = location.to_string(); - let get_request = rusoto_s3::GetObjectRequest { - bucket: self.bucket_name.clone(), - key: key.clone(), - range: range.map(format_http_range), - ..Default::default() - }; - let bucket_name = self.bucket_name.clone(); - let stream = self - .client() - .await - .get_object(get_request) - .await - .map_err(|e| match e { - rusoto_core::RusotoError::Service( - rusoto_s3::GetObjectError::NoSuchKey(_), - ) => Error::NotFound { - path: key.clone(), - source: e.into(), - }, - _ => Error::UnableToGetData { - bucket: self.bucket_name.to_owned(), - path: key.clone(), - source: e, - }, - })? - .body - .context(NoDataSnafu { - bucket: self.bucket_name.to_owned(), - path: key.clone(), - })? - .map_err(move |source| Error::UnableToGetPieceOfData { - source, - bucket: bucket_name.clone(), - path: key.clone(), - }) - .err_into(); - - Ok(stream) - } - - async fn list_objects_v2( - &self, - prefix: Option<&Path>, - delimiter: Option, - ) -> Result>> { - enum ListState { - Start, - HasMore(String), - Done, - } - - let prefix = format_prefix(prefix); - let bucket = self.bucket_name.clone(); - - let request_factory = move || rusoto_s3::ListObjectsV2Request { - bucket, - prefix, - delimiter, - ..Default::default() - }; - let s3 = self.client().await; - - Ok(stream::unfold(ListState::Start, move |state| { - let request_factory = request_factory.clone(); - let s3 = s3.clone(); - - async move { - let continuation_token = match &state { - ListState::HasMore(continuation_token) => Some(continuation_token), - ListState::Done => { - return None; - } - // If this is the first request we've made, we don't need to make any - // modifications to the request - ListState::Start => None, - }; - - let resp = s3_request(move || { - let (s3, request_factory, continuation_token) = ( - s3.clone(), - request_factory.clone(), - continuation_token.cloned(), - ); - - async move { - s3.list_objects_v2(rusoto_s3::ListObjectsV2Request { - continuation_token, - ..request_factory() - }) - .await - } - }) - .await; - - let resp = match resp { - Ok(resp) => resp, - Err(e) => return Some((Err(e), state)), - }; - - // The AWS response contains a field named `is_truncated` as well as - // `next_continuation_token`, and we're assuming that `next_continuation_token` - // is only set when `is_truncated` is true (and therefore not - // checking `is_truncated`). - let next_state = if let Some(next_continuation_token) = - &resp.next_continuation_token - { - ListState::HasMore(next_continuation_token.to_string()) - } else { - ListState::Done - }; - - Some((Ok(resp), next_state)) - } - }) - .map_err(move |e| { - Error::UnableToListData { - source: e, - bucket: self.bucket_name.clone(), - } - .into() - }) - .boxed()) - } -} - -/// Handles retrying a request to S3 up to `MAX_NUM_RETRIES` times if S3 returns 5xx server errors. 
-/// -/// The `future_factory` argument is a function `F` that takes no arguments and, when called, will -/// return a `Future` (type `G`) that, when `await`ed, will perform a request to S3 through -/// `rusoto` and return a `Result` that returns some type `R` on success and some -/// `rusoto_core::RusotoError` on error. -/// -/// If the executed `Future` returns success, this function will return that success. -/// If the executed `Future` returns a 5xx server error, this function will wait an amount of -/// time that increases exponentially with the number of times it has retried, get a new `Future` by -/// calling `future_factory` again, and retry the request by `await`ing the `Future` again. -/// The retries will continue until the maximum number of retries has been attempted. In that case, -/// this function will return the last encountered error. -/// -/// Client errors (4xx) will never be retried by this function. -async fn s3_request( - future_factory: F, -) -> Result> -where - E: std::error::Error + Send, - F: Fn() -> G + Send, - G: Future>> + Send, - R: Send, -{ - let mut attempts = 0; - - loop { - let request = future_factory(); - - let result = request.await; - - match result { - Ok(r) => return Ok(r), - Err(error) => { - attempts += 1; - - let should_retry = matches!( - error, - rusoto_core::RusotoError::Unknown(ref response) - if response.status.is_server_error() - ); - - if attempts > MAX_NUM_RETRIES { - warn!( - ?error, - attempts, "maximum number of retries exceeded for AWS S3 request" - ); - return Err(error); - } else if !should_retry { - return Err(error); - } else { - debug!(?error, attempts, "retrying AWS S3 request"); - let wait_time = Duration::from_millis(2u64.pow(attempts) * 50); - tokio::time::sleep(wait_time).await; - } - } - } - } -} - -struct S3MultiPartUpload { - bucket: String, - key: String, - upload_id: String, - client_unrestricted: rusoto_s3::S3Client, - connection_semaphore: Arc, -} - -impl CloudMultiPartUploadImpl for S3MultiPartUpload { - fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>> { - // Get values to move into future; we don't want a reference to Self - let bucket = self.bucket.clone(); - let key = self.key.clone(); - let upload_id = self.upload_id.clone(); - let content_length = buf.len(); - - let request_factory = move || rusoto_s3::UploadPartRequest { - bucket, - key, - upload_id, - // AWS part number is 1-indexed - part_number: (part_idx + 1).try_into().unwrap(), - content_length: Some(content_length.try_into().unwrap()), - body: Some(buf.into()), - ..Default::default() - }; - - let s3 = self.client_unrestricted.clone(); - let connection_semaphore = Arc::clone(&self.connection_semaphore); - - Box::pin(async move { - let _permit = connection_semaphore - .acquire_owned() - .await - .expect("semaphore shouldn't be closed yet"); - - let response = s3_request(move || { - let (s3, request_factory) = (s3.clone(), request_factory.clone()); - async move { s3.upload_part(request_factory()).await } - }) - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - - Ok(( - part_idx, - UploadPart { - content_id: response.e_tag.unwrap(), - }, - )) - }) - } - - fn complete( - &self, - completed_parts: Vec>, - ) -> BoxFuture<'static, Result<(), io::Error>> { - let parts = - completed_parts - .into_iter() - .enumerate() - .map(|(part_number, maybe_part)| match maybe_part { - Some(part) => { - Ok(rusoto_s3::CompletedPart { - e_tag: Some(part.content_id), - 
part_number: Some((part_number + 1).try_into().map_err( - |err| io::Error::new(io::ErrorKind::Other, err), - )?), - }) - } - None => Err(io::Error::new( - io::ErrorKind::Other, - format!("Missing information for upload part {:?}", part_number), - )), - }); - - // Get values to move into future; we don't want a reference to Self - let bucket = self.bucket.clone(); - let key = self.key.clone(); - let upload_id = self.upload_id.clone(); - - let request_factory = move || -> Result<_, io::Error> { - Ok(rusoto_s3::CompleteMultipartUploadRequest { - bucket, - key, - upload_id, - multipart_upload: Some(rusoto_s3::CompletedMultipartUpload { - parts: Some(parts.collect::>()?), - }), - ..Default::default() - }) - }; - - let s3 = self.client_unrestricted.clone(); - let connection_semaphore = Arc::clone(&self.connection_semaphore); - - Box::pin(async move { - let _permit = connection_semaphore - .acquire_owned() - .await - .expect("semaphore shouldn't be closed yet"); - - s3_request(move || { - let (s3, request_factory) = (s3.clone(), request_factory.clone()); - - async move { s3.complete_multipart_upload(request_factory()?).await } - }) - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - - Ok(()) - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{ - tests::{ - get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, - }, - Error as ObjectStoreError, - }; - use bytes::Bytes; - use std::env; - - const NON_EXISTENT_NAME: &str = "nonexistentname"; - - // Helper macro to skip tests if TEST_INTEGRATION and the AWS - // environment variables are not set. Returns a configured - // AmazonS3Builder - macro_rules! maybe_skip_integration { - () => {{ - dotenv::dotenv().ok(); - - let required_vars = [ - "AWS_DEFAULT_REGION", - "OBJECT_STORE_BUCKET", - "AWS_ACCESS_KEY_ID", - "AWS_SECRET_ACCESS_KEY", - ]; - let unset_vars: Vec<_> = required_vars - .iter() - .filter_map(|&name| match env::var(name) { - Ok(_) => None, - Err(_) => Some(name), - }) - .collect(); - let unset_var_names = unset_vars.join(", "); - - let force = env::var("TEST_INTEGRATION"); - - if force.is_ok() && !unset_var_names.is_empty() { - panic!( - "TEST_INTEGRATION is set, \ - but variable(s) {} need to be set", - unset_var_names - ); - } else if force.is_err() { - eprintln!( - "skipping AWS integration test - set {}TEST_INTEGRATION to run", - if unset_var_names.is_empty() { - String::new() - } else { - format!("{} and ", unset_var_names) - } - ); - return; - } else { - let config = AmazonS3Builder::new() - .with_access_key_id( - env::var("AWS_ACCESS_KEY_ID") - .expect("already checked AWS_ACCESS_KEY_ID"), - ) - .with_secret_access_key( - env::var("AWS_SECRET_ACCESS_KEY") - .expect("already checked AWS_SECRET_ACCESS_KEY"), - ) - .with_region( - env::var("AWS_DEFAULT_REGION") - .expect("already checked AWS_DEFAULT_REGION"), - ) - .with_bucket_name( - env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - ) - .with_allow_http(true); - - let config = if let Some(endpoint) = env::var("AWS_ENDPOINT").ok() { - config.with_endpoint(endpoint) - } else { - config - }; - - let config = if let Some(token) = env::var("AWS_SESSION_TOKEN").ok() { - config.with_token(token) - } else { - config - }; - - config - } - }}; - } - - #[tokio::test] - async fn s3_test() { - let config = maybe_skip_integration!(); - let integration = config.build().unwrap(); - - put_get_delete_list(&integration).await; - 
list_uses_directories_correctly(&integration).await; - list_with_delimiter(&integration).await; - rename_and_copy(&integration).await; - stream_get(&integration).await; - } - - #[tokio::test] - async fn s3_test_get_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build().unwrap(); - - let location = Path::from_iter([NON_EXISTENT_NAME]); - - let err = get_nonexistent_object(&integration, Some(location)) - .await - .unwrap_err(); - if let ObjectStoreError::NotFound { path, source } = err { - let source_variant = source.downcast_ref::>(); - assert!( - matches!( - source_variant, - Some(rusoto_core::RusotoError::Service( - rusoto_s3::GetObjectError::NoSuchKey(_) - )), - ), - "got: {:?}", - source_variant - ); - assert_eq!(path, NON_EXISTENT_NAME); - } else { - panic!("unexpected error type: {:?}", err); - } - } - - #[tokio::test] - async fn s3_test_get_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); - let integration = config.build().unwrap(); - - let location = Path::from_iter([NON_EXISTENT_NAME]); - - let err = integration.get(&location).await.unwrap_err().to_string(); - assert!( - err.contains("The specified bucket does not exist"), - "{}", - err - ) - } - - #[tokio::test] - async fn s3_test_put_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); - - let integration = config.build().unwrap(); - - let location = Path::from_iter([NON_EXISTENT_NAME]); - let data = Bytes::from("arbitrary data"); - - let err = integration - .put(&location, data) - .await - .unwrap_err() - .to_string(); - - assert!( - err.contains("The specified bucket does not exist") - && err.contains("Unable to PUT data"), - "{}", - err - ) - } - - #[tokio::test] - async fn s3_test_delete_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build().unwrap(); - - let location = Path::from_iter([NON_EXISTENT_NAME]); - - integration.delete(&location).await.unwrap(); - } - - #[tokio::test] - async fn s3_test_delete_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); - let integration = config.build().unwrap(); - - let location = Path::from_iter([NON_EXISTENT_NAME]); - - let err = integration.delete(&location).await.unwrap_err().to_string(); - assert!( - err.contains("The specified bucket does not exist") - && err.contains("Unable to DELETE data"), - "{}", - err - ) - } -} diff --git a/src/aws/client.rs b/src/aws/client.rs new file mode 100644 index 0000000..36ba9ad --- /dev/null +++ b/src/aws/client.rs @@ -0,0 +1,483 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
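The rewritten S3 support keeps the `ObjectStore`-facing surface while swapping rusoto for a hand-rolled REST client. A minimal usage sketch, assuming the builder methods from the removed `src/aws.rs` (`with_region`, `with_bucket_name`, `with_access_key_id`, `with_secret_access_key`) carry over unchanged to the new `src/aws/mod.rs`; the bucket name, keys and object path below are placeholders:

```
use bytes::Bytes;
use object_store::aws::AmazonS3Builder;
use object_store::{path::Path, ObjectStore};

async fn s3_roundtrip() -> object_store::Result<()> {
    // Builder surface assumed unchanged from the pre-rewrite src/aws.rs.
    let s3 = AmazonS3Builder::new()
        .with_region("us-east-2")
        .with_bucket_name("my-bucket")
        .with_access_key_id("ACCESS_KEY_ID")
        .with_secret_access_key("SECRET_KEY")
        .build()?;

    // Write, stat and remove a single object through the ObjectStore trait.
    let location = Path::from_iter(["data", "file.parquet"]);
    s3.put(&location, Bytes::from_static(b"hello")).await?;
    let meta = s3.head(&location).await?;
    assert_eq!(meta.size, 5);
    s3.delete(&location).await?;
    Ok(())
}
```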
+ +use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; +use crate::client::pagination::stream_paginated; +use crate::client::retry::RetryExt; +use crate::multipart::UploadPart; +use crate::path::DELIMITER; +use crate::util::{format_http_range, format_prefix}; +use crate::{ + BoxStream, ListResult, MultipartId, ObjectMeta, Path, Result, RetryConfig, StreamExt, +}; +use bytes::{Buf, Bytes}; +use chrono::{DateTime, Utc}; +use percent_encoding::{utf8_percent_encode, AsciiSet, PercentEncode, NON_ALPHANUMERIC}; +use reqwest::{Client as ReqwestClient, Method, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use snafu::{ResultExt, Snafu}; +use std::ops::Range; +use std::sync::Arc; + +// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html +// +// Do not URI-encode any of the unreserved characters that RFC 3986 defines: +// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). +const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); + +/// This struct is used to maintain the URI path encoding +const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub(crate) enum Error { + #[snafu(display("Error performing get request {}: {}", path, source))] + GetRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing put request {}: {}", path, source))] + PutRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing delete request {}: {}", path, source))] + DeleteRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing copy request {}: {}", path, source))] + CopyRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing list request: {}", source))] + ListRequest { source: reqwest::Error }, + + #[snafu(display("Error performing create multipart request: {}", source))] + CreateMultipartRequest { source: reqwest::Error }, + + #[snafu(display("Error performing complete multipart request: {}", source))] + CompleteMultipartRequest { source: reqwest::Error }, + + #[snafu(display("Got invalid list response: {}", source))] + InvalidListResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Got invalid multipart response: {}", source))] + InvalidMultipartResponse { source: quick_xml::de::DeError }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + match err { + Error::GetRequest { source, path } + | Error::DeleteRequest { source, path } + | Error::CopyRequest { source, path } + | Error::PutRequest { source, path } + if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => + { + Self::NotFound { + path, + source: Box::new(source), + } + } + _ => Self::Generic { + store: "S3", + source: Box::new(err), + }, + } + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListResponse { + #[serde(default)] + pub contents: Vec, + #[serde(default)] + pub common_prefixes: Vec, + #[serde(default)] + pub next_continuation_token: Option, +} + +impl TryFrom for ListResult { + type Error = crate::Error; + + fn try_from(value: ListResponse) -> Result { + let common_prefixes = value + .common_prefixes + .into_iter() + .map(|x| Ok(Path::parse(&x.prefix)?)) + .collect::>()?; + + let objects = value + .contents + .into_iter() + .map(TryFrom::try_from) + 
.collect::>()?; + + Ok(Self { + common_prefixes, + objects, + }) + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListPrefix { + pub prefix: String, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListContents { + pub key: String, + pub size: usize, + pub last_modified: DateTime, +} + +impl TryFrom for ObjectMeta { + type Error = crate::Error; + + fn try_from(value: ListContents) -> Result { + Ok(Self { + location: Path::parse(value.key)?, + last_modified: value.last_modified, + size: value.size, + }) + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct InitiateMultipart { + upload_id: String, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "PascalCase", rename = "CompleteMultipartUpload")] +struct CompleteMultipart { + part: Vec, +} + +#[derive(Debug, Serialize)] +struct MultipartPart { + #[serde(rename = "$unflatten=ETag")] + e_tag: String, + #[serde(rename = "$unflatten=PartNumber")] + part_number: usize, +} + +#[derive(Debug)] +pub struct S3Config { + pub region: String, + pub endpoint: String, + pub bucket: String, + pub credentials: CredentialProvider, + pub retry_config: RetryConfig, + pub allow_http: bool, +} + +impl S3Config { + fn path_url(&self, path: &Path) -> String { + format!("{}/{}/{}", self.endpoint, self.bucket, encode_path(path)) + } +} + +#[derive(Debug)] +pub(crate) struct S3Client { + config: S3Config, + client: ReqwestClient, +} + +impl S3Client { + pub fn new(config: S3Config) -> Self { + let client = reqwest::ClientBuilder::new() + .https_only(!config.allow_http) + .build() + .unwrap(); + + Self { config, client } + } + + /// Returns the config + pub fn config(&self) -> &S3Config { + &self.config + } + + async fn get_credential(&self) -> Result> { + self.config.credentials.get_credential().await + } + + /// Make an S3 GET request + pub async fn get_request( + &self, + path: &Path, + range: Option>, + head: bool, + ) -> Result { + use reqwest::header::RANGE; + + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + + let mut builder = self.client.request(method, url); + + if let Some(range) = range { + builder = builder.header(RANGE, format_http_range(range)); + } + + let response = builder + .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })? + .error_for_status() + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } + + /// Make an S3 PUT request + pub async fn put_request( + &self, + path: &Path, + bytes: Option, + query: &T, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + + let mut builder = self.client.request(Method::PUT, url); + if let Some(bytes) = bytes { + builder = builder.body(bytes) + } + + let response = builder + .query(query) + .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: path.as_ref(), + })? 
+ .error_for_status() + .context(PutRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } + + /// Make an S3 Delete request + pub async fn delete_request( + &self, + path: &Path, + query: &T, + ) -> Result<()> { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + + self.client + .request(Method::DELETE, url) + .query(query) + .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .send_retry(&self.config.retry_config) + .await + .context(DeleteRequestSnafu { + path: path.as_ref(), + })? + .error_for_status() + .context(DeleteRequestSnafu { + path: path.as_ref(), + })?; + + Ok(()) + } + + /// Make an S3 Copy request + pub async fn copy_request(&self, from: &Path, to: &Path) -> Result<()> { + let credential = self.get_credential().await?; + let url = self.config.path_url(to); + let source = format!("{}/{}", self.config.bucket, encode_path(from)); + + self.client + .request(Method::PUT, url) + .header("x-amz-copy-source", source) + .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .send_retry(&self.config.retry_config) + .await + .context(CopyRequestSnafu { + path: from.as_ref(), + })? + .error_for_status() + .context(CopyRequestSnafu { + path: from.as_ref(), + })?; + + Ok(()) + } + + /// Make an S3 List request + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + token: Option<&str>, + ) -> Result<(ListResult, Option)> { + let credential = self.get_credential().await?; + let url = format!("{}/{}", self.config.endpoint, self.config.bucket); + + let mut query = Vec::with_capacity(4); + + // Note: the order of these matters to ensure the generated URL is canonical + if let Some(token) = token { + query.push(("continuation-token", token)) + } + + if delimiter { + query.push(("delimiter", DELIMITER)) + } + + query.push(("list-type", "2")); + + if let Some(prefix) = prefix { + query.push(("prefix", prefix)) + } + + let response = self + .client + .request(Method::GET, &url) + .query(&query) + .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .send_retry(&self.config.retry_config) + .await + .context(ListRequestSnafu)? + .error_for_status() + .context(ListRequestSnafu)? + .bytes() + .await + .context(ListRequestSnafu)?; + + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .context(InvalidListResponseSnafu)?; + let token = response.next_continuation_token.take(); + + Ok((response.try_into()?, token)) + } + + /// Perform a list operation automatically handling pagination + pub fn list_paginated( + &self, + prefix: Option<&Path>, + delimiter: bool, + ) -> BoxStream<'_, Result> { + let prefix = format_prefix(prefix); + stream_paginated(prefix, move |prefix, token| async move { + let (r, next_token) = self + .list_request(prefix.as_deref(), delimiter, token.as_deref()) + .await?; + Ok((r, prefix, next_token)) + }) + .boxed() + } + + pub async fn create_multipart(&self, location: &Path) -> Result { + let credential = self.get_credential().await?; + let url = format!( + "{}/{}/{}?uploads", + self.config.endpoint, + self.config.bucket, + encode_path(location) + ); + + let response = self + .client + .request(Method::POST, url) + .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .send_retry(&self.config.retry_config) + .await + .context(CreateMultipartRequestSnafu)? + .error_for_status() + .context(CreateMultipartRequestSnafu)? 
+ .bytes() + .await + .context(CreateMultipartRequestSnafu)?; + + let response: InitiateMultipart = quick_xml::de::from_reader(response.reader()) + .context(InvalidMultipartResponseSnafu)?; + + Ok(response.upload_id) + } + + pub async fn complete_multipart( + &self, + location: &Path, + upload_id: &str, + parts: Vec, + ) -> Result<()> { + let parts = parts + .into_iter() + .enumerate() + .map(|(part_idx, part)| MultipartPart { + e_tag: part.content_id, + part_number: part_idx + 1, + }) + .collect(); + + let request = CompleteMultipart { part: parts }; + let body = quick_xml::se::to_string(&request).unwrap(); + + let credential = self.get_credential().await?; + let url = self.config.path_url(location); + + self.client + .request(Method::POST, url) + .query(&[("uploadId", upload_id)]) + .body(body) + .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .send_retry(&self.config.retry_config) + .await + .context(CompleteMultipartRequestSnafu)? + .error_for_status() + .context(CompleteMultipartRequestSnafu)?; + + Ok(()) + } +} + +fn encode_path(path: &Path) -> PercentEncode<'_> { + utf8_percent_encode(path.as_ref(), &STRICT_PATH_ENCODE_SET) +} diff --git a/src/aws/credential.rs b/src/aws/credential.rs new file mode 100644 index 0000000..b750059 --- /dev/null +++ b/src/aws/credential.rs @@ -0,0 +1,590 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
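
// Aside on `encode_path` at the end of client.rs above: object paths are percent-encoded
// with the crate's STRICT_PATH_ENCODE_SET before being spliced into request URLs. A minimal,
// self-contained sketch of the same idea, using an illustrative encode set (the real set is
// defined elsewhere in the crate):
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};

// Illustrative set only; '/' is deliberately left out so path separators survive encoding.
const DEMO_SET: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>');

fn main() {
    let encoded = utf8_percent_encode("data/file name.txt", DEMO_SET).to_string();
    assert_eq!(encoded, "data/file%20name.txt");
}
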
+ +use crate::client::retry::RetryExt; +use crate::client::token::{TemporaryToken, TokenCache}; +use crate::{Result, RetryConfig}; +use bytes::Buf; +use chrono::{DateTime, Utc}; +use futures::TryFutureExt; +use reqwest::header::{HeaderMap, HeaderValue}; +use reqwest::{Client, Method, Request, RequestBuilder}; +use serde::Deserialize; +use std::collections::BTreeMap; +use std::sync::Arc; +use std::time::Instant; + +type StdError = Box; + +/// SHA256 hash of empty string +static EMPTY_SHA256_HASH: &str = + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; + +#[derive(Debug)] +pub struct AwsCredential { + pub key_id: String, + pub secret_key: String, + pub token: Option, +} + +impl AwsCredential { + /// Signs a string + /// + /// + fn sign( + &self, + to_sign: &str, + date: DateTime, + region: &str, + service: &str, + ) -> String { + let date_string = date.format("%Y%m%d").to_string(); + let date_hmac = hmac_sha256(format!("AWS4{}", self.secret_key), date_string); + let region_hmac = hmac_sha256(date_hmac, region); + let service_hmac = hmac_sha256(region_hmac, service); + let signing_hmac = hmac_sha256(service_hmac, b"aws4_request"); + hex_encode(hmac_sha256(signing_hmac, to_sign).as_ref()) + } +} + +struct RequestSigner<'a> { + date: DateTime, + credential: &'a AwsCredential, + service: &'a str, + region: &'a str, +} + +const DATE_HEADER: &str = "x-amz-date"; +const HASH_HEADER: &str = "x-amz-content-sha256"; +const TOKEN_HEADER: &str = "x-amz-security-token"; +const AUTH_HEADER: &str = "authorization"; + +const ALL_HEADERS: &[&str; 4] = &[DATE_HEADER, HASH_HEADER, TOKEN_HEADER, AUTH_HEADER]; + +impl<'a> RequestSigner<'a> { + fn sign(&self, request: &mut Request) { + if let Some(ref token) = self.credential.token { + let token_val = HeaderValue::from_str(token).unwrap(); + request.headers_mut().insert(TOKEN_HEADER, token_val); + } + + let host_val = HeaderValue::from_str( + &request.url()[url::Position::BeforeHost..url::Position::AfterPort], + ) + .unwrap(); + request.headers_mut().insert("host", host_val); + + let date_str = self.date.format("%Y%m%dT%H%M%SZ").to_string(); + let date_val = HeaderValue::from_str(&date_str).unwrap(); + request.headers_mut().insert(DATE_HEADER, date_val); + + let digest = match request.body() { + None => EMPTY_SHA256_HASH.to_string(), + Some(body) => hex_digest(body.as_bytes().unwrap()), + }; + + let header_digest = HeaderValue::from_str(&digest).unwrap(); + request.headers_mut().insert(HASH_HEADER, header_digest); + + let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); + + // https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + let canonical_request = format!( + "{}\n{}\n{}\n{}\n{}\n{}", + request.method().as_str(), + request.url().path(), // S3 doesn't percent encode this like other services + request.url().query().unwrap_or(""), // This assumes the query pairs are in order + canonical_headers, + signed_headers, + digest + ); + + let hashed_canonical_request = hex_digest(canonical_request.as_bytes()); + let scope = format!( + "{}/{}/{}/aws4_request", + self.date.format("%Y%m%d"), + self.region, + self.service + ); + + let string_to_sign = format!( + "AWS4-HMAC-SHA256\n{}\n{}\n{}", + self.date.format("%Y%m%dT%H%M%SZ"), + scope, + hashed_canonical_request + ); + + // sign the string + let signature = + self.credential + .sign(&string_to_sign, self.date, self.region, self.service); + + // build the actual auth header + let authorisation = format!( + "AWS4-HMAC-SHA256 
Credential={}/{}, SignedHeaders={}, Signature={}", + self.credential.key_id, scope, signed_headers, signature + ); + + let authorization_val = HeaderValue::from_str(&authorisation).unwrap(); + request.headers_mut().insert(AUTH_HEADER, authorization_val); + } +} + +pub trait CredentialExt { + /// Sign a request + fn with_aws_sigv4( + self, + credential: &AwsCredential, + region: &str, + service: &str, + ) -> Self; +} + +impl CredentialExt for RequestBuilder { + fn with_aws_sigv4( + mut self, + credential: &AwsCredential, + region: &str, + service: &str, + ) -> Self { + // Hack around lack of access to underlying request + // https://github.com/seanmonstar/reqwest/issues/1212 + let mut request = self + .try_clone() + .expect("not stream") + .build() + .expect("request valid"); + + let date = Utc::now(); + let signer = RequestSigner { + date, + credential, + service, + region, + }; + + signer.sign(&mut request); + + for header in ALL_HEADERS { + if let Some(val) = request.headers_mut().remove(*header) { + self = self.header(*header, val) + } + } + self + } +} + +fn hmac_sha256(secret: impl AsRef<[u8]>, bytes: impl AsRef<[u8]>) -> ring::hmac::Tag { + let key = ring::hmac::Key::new(ring::hmac::HMAC_SHA256, secret.as_ref()); + ring::hmac::sign(&key, bytes.as_ref()) +} + +/// Computes the SHA256 digest of `body` returned as a hex encoded string +fn hex_digest(bytes: &[u8]) -> String { + let digest = ring::digest::digest(&ring::digest::SHA256, bytes); + hex_encode(digest.as_ref()) +} + +/// Returns `bytes` as a lower-case hex encoded string +fn hex_encode(bytes: &[u8]) -> String { + use std::fmt::Write; + let mut out = String::with_capacity(bytes.len() * 2); + for byte in bytes { + // String writing is infallible + let _ = write!(out, "{:02x}", byte); + } + out +} + +/// Canonicalizes headers into the AWS Canonical Form. 
+/// +/// +fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { + let mut headers = BTreeMap::<&str, Vec<&str>>::new(); + let mut value_count = 0; + let mut value_bytes = 0; + let mut key_bytes = 0; + + for (key, value) in header_map { + let key = key.as_str(); + if ["authorization", "content-length", "user-agent"].contains(&key) { + continue; + } + + let value = std::str::from_utf8(value.as_bytes()).unwrap(); + key_bytes += key.len(); + value_bytes += value.len(); + value_count += 1; + headers.entry(key).or_default().push(value); + } + + let mut signed_headers = String::with_capacity(key_bytes + headers.len()); + let mut canonical_headers = + String::with_capacity(key_bytes + value_bytes + headers.len() + value_count); + + for (header_idx, (name, values)) in headers.into_iter().enumerate() { + if header_idx != 0 { + signed_headers.push(';'); + } + + signed_headers.push_str(name); + canonical_headers.push_str(name); + canonical_headers.push(':'); + for (value_idx, value) in values.into_iter().enumerate() { + if value_idx != 0 { + canonical_headers.push(','); + } + canonical_headers.push_str(value.trim()); + } + canonical_headers.push('\n'); + } + + (signed_headers, canonical_headers) +} + +/// Provides credentials for use when signing requests +#[derive(Debug)] +pub enum CredentialProvider { + Static(StaticCredentialProvider), + Instance(InstanceCredentialProvider), + WebIdentity(WebIdentityProvider), +} + +impl CredentialProvider { + pub async fn get_credential(&self) -> Result> { + match self { + Self::Static(s) => Ok(Arc::clone(&s.credential)), + Self::Instance(c) => c.get_credential().await, + Self::WebIdentity(c) => c.get_credential().await, + } + } +} + +/// A static set of credentials +#[derive(Debug)] +pub struct StaticCredentialProvider { + pub credential: Arc, +} + +/// Credentials sourced from the instance metadata service +/// +/// +#[derive(Debug)] +pub struct InstanceCredentialProvider { + pub cache: TokenCache>, + pub client: Client, + pub retry_config: RetryConfig, +} + +impl InstanceCredentialProvider { + async fn get_credential(&self) -> Result> { + self.cache + .get_or_insert_with(|| { + const METADATA_ENDPOINT: &str = "http://169.254.169.254"; + instance_creds(&self.client, &self.retry_config, METADATA_ENDPOINT) + .map_err(|source| crate::Error::Generic { + store: "S3", + source, + }) + }) + .await + } +} + +/// Credentials sourced using AssumeRoleWithWebIdentity +/// +/// +#[derive(Debug)] +pub struct WebIdentityProvider { + pub cache: TokenCache>, + pub token: String, + pub role_arn: String, + pub session_name: String, + pub endpoint: String, + pub client: Client, + pub retry_config: RetryConfig, +} + +impl WebIdentityProvider { + async fn get_credential(&self) -> Result> { + self.cache + .get_or_insert_with(|| { + web_identity( + &self.client, + &self.retry_config, + &self.token, + &self.role_arn, + &self.session_name, + &self.endpoint, + ) + .map_err(|source| crate::Error::Generic { + store: "S3", + source, + }) + }) + .await + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct InstanceCredentials { + access_key_id: String, + secret_access_key: String, + token: String, + expiration: DateTime, +} + +impl From for AwsCredential { + fn from(s: InstanceCredentials) -> Self { + Self { + key_id: s.access_key_id, + secret_key: s.secret_access_key, + token: Some(s.token), + } + } +} + +/// +async fn instance_creds( + client: &Client, + retry_config: &RetryConfig, + endpoint: &str, +) -> Result>, StdError> { + const 
CREDENTIALS_PATH: &str = "latest/meta-data/iam/security-credentials"; + const AWS_EC2_METADATA_TOKEN_HEADER: &str = "X-aws-ec2-metadata-token"; + + let token_url = format!("{}/latest/api/token", endpoint); + let token = client + .request(Method::PUT, token_url) + .header("X-aws-ec2-metadata-token-ttl-seconds", "600") // 10 minute TTL + .send_retry(retry_config) + .await? + .text() + .await?; + + let role_url = format!("{}/{}/", endpoint, CREDENTIALS_PATH); + let role = client + .request(Method::GET, role_url) + .header(AWS_EC2_METADATA_TOKEN_HEADER, &token) + .send_retry(retry_config) + .await? + .text() + .await?; + + let creds_url = format!("{}/{}/{}", endpoint, CREDENTIALS_PATH, role); + let creds: InstanceCredentials = client + .request(Method::GET, creds_url) + .header(AWS_EC2_METADATA_TOKEN_HEADER, &token) + .send_retry(retry_config) + .await? + .json() + .await?; + + let now = Utc::now(); + let ttl = (creds.expiration - now).to_std().unwrap_or_default(); + Ok(TemporaryToken { + token: Arc::new(creds.into()), + expiry: Instant::now() + ttl, + }) +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct AssumeRoleResponse { + assume_role_with_web_identity_result: AssumeRoleResult, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct AssumeRoleResult { + credentials: AssumeRoleCredentials, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct AssumeRoleCredentials { + session_token: String, + secret_access_key: String, + access_key_id: String, + expiration: DateTime, +} + +impl From for AwsCredential { + fn from(s: AssumeRoleCredentials) -> Self { + Self { + key_id: s.access_key_id, + secret_key: s.secret_access_key, + token: Some(s.session_token), + } + } +} + +/// +async fn web_identity( + client: &Client, + retry_config: &RetryConfig, + token: &str, + role_arn: &str, + session_name: &str, + endpoint: &str, +) -> Result>, StdError> { + let bytes = client + .request(Method::POST, endpoint) + .query(&[ + ("Action", "AssumeRoleWithWebIdentity"), + ("DurationSeconds", "3600"), + ("RoleArn", role_arn), + ("RoleSessionName", session_name), + ("Version", "2011-06-15"), + ("WebIdentityToken", token), + ]) + .send_retry(retry_config) + .await? 
+ .bytes() + .await?; + + let resp: AssumeRoleResponse = quick_xml::de::from_reader(bytes.reader()) + .map_err(|e| format!("Invalid AssumeRoleWithWebIdentity response: {}", e))?; + + let creds = resp.assume_role_with_web_identity_result.credentials; + let now = Utc::now(); + let ttl = (creds.expiration - now).to_std().unwrap_or_default(); + + Ok(TemporaryToken { + token: Arc::new(creds.into()), + expiry: Instant::now() + ttl, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use reqwest::{Client, Method}; + use std::env; + + // Test generated using https://docs.aws.amazon.com/general/latest/gr/sigv4-signed-request-examples.html + #[test] + fn test_sign() { + let client = Client::new(); + + // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + // method = 'GET' + // service = 'ec2' + // host = 'ec2.amazonaws.com' + // region = 'us-east-1' + // endpoint = 'https://ec2.amazonaws.com' + // request_parameters = '' + let date = DateTime::parse_from_rfc3339("2022-08-06T18:01:34Z") + .unwrap() + .with_timezone(&Utc); + + let mut request = client + .request(Method::GET, "https://ec2.amazon.com/") + .build() + .unwrap(); + + let signer = RequestSigner { + date, + credential: &credential, + service: "ec2", + region: "us-east-1", + }; + + signer.sign(&mut request); + assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") + } + + #[test] + fn test_sign_port() { + let client = Client::new(); + + let credential = AwsCredential { + key_id: "H20ABqCkLZID4rLe".to_string(), + secret_key: "jMqRDgxSsBqqznfmddGdu1TmmZOJQxdM".to_string(), + token: None, + }; + + let date = DateTime::parse_from_rfc3339("2022-08-09T13:05:25Z") + .unwrap() + .with_timezone(&Utc); + + let mut request = client + .request(Method::GET, "http://localhost:9000/tsm-schemas") + .query(&[ + ("delimiter", "/"), + ("encoding-type", "url"), + ("list-type", "2"), + ("prefix", ""), + ]) + .build() + .unwrap(); + + let signer = RequestSigner { + date, + credential: &credential, + service: "s3", + region: "us-east-1", + }; + + signer.sign(&mut request); + assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=H20ABqCkLZID4rLe/20220809/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=9ebf2f92872066c99ac94e573b4e1b80f4dbb8a32b1e8e23178318746e7d1b4d") + } + + #[tokio::test] + async fn test_instance_metadata() { + if env::var("TEST_INTEGRATION").is_err() { + eprintln!("skipping AWS integration test"); + } + + // For example https://github.com/aws/amazon-ec2-metadata-mock + let endpoint = env::var("EC2_METADATA_ENDPOINT").unwrap(); + let client = Client::new(); + let retry_config = RetryConfig::default(); + + // Verify only allows IMDSv2 + let resp = client + .request(Method::GET, format!("{}/latest/meta-data/ami-id", endpoint)) + .send() + .await + .unwrap(); + + assert_eq!( + resp.status(), + reqwest::StatusCode::UNAUTHORIZED, + "Ensure metadata endpoint is set to only allow IMDSv2" + ); + + let creds = instance_creds(&client, &retry_config, &endpoint) + .await + .unwrap(); + + let id = &creds.token.key_id; + let secret = 
&creds.token.secret_key; + let token = creds.token.token.as_ref().unwrap(); + + assert!(!id.is_empty()); + assert!(!secret.is_empty()); + assert!(!token.is_empty()) + } +} diff --git a/src/aws/mod.rs b/src/aws/mod.rs new file mode 100644 index 0000000..06d20cc --- /dev/null +++ b/src/aws/mod.rs @@ -0,0 +1,646 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store implementation for S3 +//! +//! ## Multi-part uploads +//! +//! Multi-part uploads can be initiated with the [ObjectStore::put_multipart] method. +//! Data passed to the writer is automatically buffered to meet the minimum size +//! requirements for a part. Multiple parts are uploaded concurrently. +//! +//! If the writer fails for any reason, you may have parts uploaded to AWS but not +//! used that you may be charged for. Use the [ObjectStore::abort_multipart] method +//! to abort the upload and drop those unneeded parts. In addition, you may wish to +//! consider implementing [automatic cleanup] of unused parts that are older than one +//! week. +//! +//! 
[automatic cleanup]: https://aws.amazon.com/blogs/aws/s3-lifecycle-management-update-support-for-multipart-uploads-and-delete-markers/ + +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use futures::stream::BoxStream; +use futures::TryStreamExt; +use reqwest::Client; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::collections::BTreeSet; +use std::ops::Range; +use std::sync::Arc; +use tokio::io::AsyncWrite; +use tracing::info; + +use crate::aws::client::{S3Client, S3Config}; +use crate::aws::credential::{ + AwsCredential, CredentialProvider, InstanceCredentialProvider, + StaticCredentialProvider, WebIdentityProvider, +}; +use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; +use crate::{ + GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, + RetryConfig, StreamExt, +}; + +mod client; +mod credential; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Last-Modified Header missing from response"))] + MissingLastModified, + + #[snafu(display("Content-Length Header missing from response"))] + MissingContentLength, + + #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] + InvalidLastModified { + last_modified: String, + source: chrono::ParseError, + }, + + #[snafu(display("Invalid content length '{}': {}", content_length, source))] + InvalidContentLength { + content_length: String, + source: std::num::ParseIntError, + }, + + #[snafu(display("Missing region"))] + MissingRegion, + + #[snafu(display("Missing bucket name"))] + MissingBucketName, + + #[snafu(display("Missing AccessKeyId"))] + MissingAccessKeyId, + + #[snafu(display("Missing SecretAccessKey"))] + MissingSecretAccessKey, + + #[snafu(display("ETag Header missing from response"))] + MissingEtag, + + #[snafu(display("Received header containing non-ASCII data"))] + BadHeader { source: reqwest::header::ToStrError }, + + #[snafu(display("Error reading token file: {}", source))] + ReadTokenFile { source: std::io::Error }, +} + +impl From for super::Error { + fn from(err: Error) -> Self { + Self::Generic { + store: "S3", + source: Box::new(err), + } + } +} + +/// Interface for [Amazon S3](https://aws.amazon.com/s3/). 
+#[derive(Debug)] +pub struct AmazonS3 { + client: Arc, +} + +impl std::fmt::Display for AmazonS3 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "AmazonS3({})", self.client.config().bucket) + } +} + +#[async_trait] +impl ObjectStore for AmazonS3 { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.client.put_request(location, Some(bytes), &()).await?; + Ok(()) + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let id = self.client.create_multipart(location).await?; + + let upload = S3MultiPartUpload { + location: location.clone(), + upload_id: id.clone(), + client: Arc::clone(&self.client), + }; + + Ok((id, Box::new(CloudMultiPartUpload::new(upload, 8)))) + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + self.client + .delete_request(location, &[("uploadId", multipart_id)]) + .await + } + + async fn get(&self, location: &Path) -> Result { + let response = self.client.get_request(location, None, false).await?; + let stream = response + .bytes_stream() + .map_err(|source| crate::Error::Generic { + store: "S3", + source: Box::new(source), + }) + .boxed(); + + Ok(GetResult::Stream(stream)) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let bytes = self + .client + .get_request(location, Some(range), false) + .await? + .bytes() + .await + .map_err(|source| client::Error::GetRequest { + source, + path: location.to_string(), + })?; + Ok(bytes) + } + + async fn head(&self, location: &Path) -> Result { + use reqwest::header::{CONTENT_LENGTH, LAST_MODIFIED}; + + // Extract meta from headers + // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax + let response = self.client.get_request(location, None, true).await?; + let headers = response.headers(); + + let last_modified = headers + .get(LAST_MODIFIED) + .context(MissingLastModifiedSnafu)?; + + let content_length = headers + .get(CONTENT_LENGTH) + .context(MissingContentLengthSnafu)?; + + let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; + let last_modified = DateTime::parse_from_rfc2822(last_modified) + .context(InvalidLastModifiedSnafu { last_modified })? 
+ .with_timezone(&Utc); + + let content_length = content_length.to_str().context(BadHeaderSnafu)?; + let content_length = content_length + .parse() + .context(InvalidContentLengthSnafu { content_length })?; + Ok(ObjectMeta { + location: location.clone(), + last_modified, + size: content_length, + }) + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.client.delete_request(location, &()).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let stream = self + .client + .list_paginated(prefix, false) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let mut stream = self.client.list_paginated(prefix, true); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + while let Some(result) = stream.next().await { + let response = result?; + common_prefixes.extend(response.common_prefixes.into_iter()); + objects.extend(response.objects.into_iter()); + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy_request(from, to).await + } + + async fn copy_if_not_exists(&self, _source: &Path, _dest: &Path) -> Result<()> { + // Will need dynamodb_lock + Err(crate::Error::NotImplemented) + } +} + +struct S3MultiPartUpload { + location: Path, + upload_id: String, + client: Arc, +} + +#[async_trait] +impl CloudMultiPartUploadImpl for S3MultiPartUpload { + async fn put_multipart_part( + &self, + buf: Vec, + part_idx: usize, + ) -> Result { + use reqwest::header::ETAG; + let part = (part_idx + 1).to_string(); + + let response = self + .client + .put_request( + &self.location, + Some(buf.into()), + &[("partNumber", &part), ("uploadId", &self.upload_id)], + ) + .await?; + + let etag = response + .headers() + .get(ETAG) + .context(MissingEtagSnafu) + .map_err(crate::Error::from)?; + + let etag = etag + .to_str() + .context(BadHeaderSnafu) + .map_err(crate::Error::from)?; + + Ok(UploadPart { + content_id: etag.to_string(), + }) + } + + async fn complete( + &self, + completed_parts: Vec, + ) -> Result<(), std::io::Error> { + self.client + .complete_multipart(&self.location, &self.upload_id, completed_parts) + .await?; + Ok(()) + } +} + +/// Configure a connection to Amazon S3 using the specified credentials in +/// the specified Amazon region and bucket. +/// +/// # Example +/// ``` +/// # let REGION = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY_ID = "foo"; +/// # let SECRET_KEY = "foo"; +/// # use object_store::aws::AmazonS3Builder; +/// let s3 = AmazonS3Builder::new() +/// .with_region(REGION) +/// .with_bucket_name(BUCKET_NAME) +/// .with_access_key_id(ACCESS_KEY_ID) +/// .with_secret_access_key(SECRET_KEY) +/// .build(); +/// ``` +#[derive(Debug, Default)] +pub struct AmazonS3Builder { + access_key_id: Option, + secret_access_key: Option, + region: Option, + bucket_name: Option, + endpoint: Option, + token: Option, + retry_config: RetryConfig, + allow_http: bool, +} + +impl AmazonS3Builder { + /// Create a new [`AmazonS3Builder`] with default values. 
+ pub fn new() -> Self { + Default::default() + } + + /// Set the AWS Access Key (required) + pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { + self.access_key_id = Some(access_key_id.into()); + self + } + + /// Set the AWS Secret Access Key (required) + pub fn with_secret_access_key( + mut self, + secret_access_key: impl Into, + ) -> Self { + self.secret_access_key = Some(secret_access_key.into()); + self + } + + /// Set the region (e.g. `us-east-1`) (required) + pub fn with_region(mut self, region: impl Into) -> Self { + self.region = Some(region.into()); + self + } + + /// Set the bucket_name (required) + pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { + self.bucket_name = Some(bucket_name.into()); + self + } + + /// Sets the endpoint for communicating with AWS S3. Default value + /// is based on region. + /// + /// For example, this might be set to `"http://localhost:4566:` + /// for testing against a localstack instance. + pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { + self.endpoint = Some(endpoint.into()); + self + } + + /// Set the token to use for requests (passed to underlying provider) + pub fn with_token(mut self, token: impl Into) -> Self { + self.token = Some(token.into()); + self + } + + /// Sets what protocol is allowed. If `allow_http` is : + /// * false (default): Only HTTPS are allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.allow_http = allow_http; + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// Create a [`AmazonS3`] instance from the provided values, + /// consuming `self`. + pub fn build(self) -> Result { + let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; + let region = self.region.context(MissingRegionSnafu)?; + + let credentials = match (self.access_key_id, self.secret_access_key, self.token) { + (Some(key_id), Some(secret_key), token) => { + info!("Using Static credential provider"); + CredentialProvider::Static(StaticCredentialProvider { + credential: Arc::new(AwsCredential { + key_id, + secret_key, + token, + }), + }) + } + (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), + (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), + // TODO: Replace with `AmazonS3Builder::credentials_from_env` + _ => match ( + std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE"), + std::env::var("AWS_ROLE_ARN"), + ) { + (Some(token_file), Ok(role_arn)) => { + info!("Using WebIdentity credential provider"); + let token = std::fs::read_to_string(token_file) + .context(ReadTokenFileSnafu)?; + + let session_name = std::env::var("AWS_ROLE_SESSION_NAME") + .unwrap_or_else(|_| "WebIdentitySession".to_string()); + + let endpoint = format!("https://sts.{}.amazonaws.com", region); + + // Disallow non-HTTPs requests + let client = Client::builder().https_only(true).build().unwrap(); + + CredentialProvider::WebIdentity(WebIdentityProvider { + cache: Default::default(), + token, + session_name, + role_arn, + endpoint, + client, + retry_config: self.retry_config.clone(), + }) + } + _ => { + info!("Using Instance credential provider"); + + // The instance metadata endpoint is access over HTTP + let client = Client::builder().https_only(false).build().unwrap(); + + CredentialProvider::Instance(InstanceCredentialProvider { + cache: Default::default(), + client, + retry_config: self.retry_config.clone(), + 
}) + } + }, + }; + + let endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://s3.{}.amazonaws.com", region)); + + let config = S3Config { + region, + endpoint, + bucket, + credentials, + retry_config: self.retry_config, + allow_http: self.allow_http, + }; + + let client = Arc::new(S3Client::new(config)); + + Ok(AmazonS3 { client }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::{ + get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, stream_get, + }; + use bytes::Bytes; + use std::env; + + const NON_EXISTENT_NAME: &str = "nonexistentname"; + + // Helper macro to skip tests if TEST_INTEGRATION and the AWS + // environment variables are not set. Returns a configured + // AmazonS3Builder + macro_rules! maybe_skip_integration { + () => {{ + dotenv::dotenv().ok(); + + let required_vars = [ + "AWS_DEFAULT_REGION", + "OBJECT_STORE_BUCKET", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + ]; + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ); + } else if force.is_err() { + eprintln!( + "skipping AWS integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + return; + } else { + let config = AmazonS3Builder::new() + .with_access_key_id( + env::var("AWS_ACCESS_KEY_ID") + .expect("already checked AWS_ACCESS_KEY_ID"), + ) + .with_secret_access_key( + env::var("AWS_SECRET_ACCESS_KEY") + .expect("already checked AWS_SECRET_ACCESS_KEY"), + ) + .with_region( + env::var("AWS_DEFAULT_REGION") + .expect("already checked AWS_DEFAULT_REGION"), + ) + .with_bucket_name( + env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + ) + .with_allow_http(true); + + let config = if let Some(endpoint) = env::var("AWS_ENDPOINT").ok() { + config.with_endpoint(endpoint) + } else { + config + }; + + let config = if let Some(token) = env::var("AWS_SESSION_TOKEN").ok() { + config.with_token(token) + } else { + config + }; + + config + } + }}; + } + + #[tokio::test] + async fn s3_test() { + let config = maybe_skip_integration!(); + let integration = config.build().unwrap(); + + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + stream_get(&integration).await; + } + + #[tokio::test] + async fn s3_test_get_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = config.build().unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = get_nonexistent_object(&integration, Some(location)) + .await + .unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + } + + #[tokio::test] + async fn s3_test_get_nonexistent_bucket() { + let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + let integration = config.build().unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.get(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + } + + #[tokio::test] + async fn s3_test_put_nonexistent_bucket() { + let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + + let integration = config.build().unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + let data = Bytes::from("arbitrary data"); + + let err = integration.put(&location, data).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + } + + #[tokio::test] + async fn s3_test_delete_nonexistent_location() { + let config = maybe_skip_integration!(); + let integration = config.build().unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + integration.delete(&location).await.unwrap(); + } + + #[tokio::test] + async fn s3_test_delete_nonexistent_bucket() { + let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + let integration = config.build().unwrap(); + + let location = Path::from_iter([NON_EXISTENT_NAME]); + + let err = integration.delete(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + } +} diff --git a/src/azure.rs b/src/azure.rs index 9987c03..a9dbc53 100644 --- a/src/azure.rs +++ b/src/azure.rs @@ -49,7 +49,7 @@ use azure_storage_blobs::prelude::{ }; use bytes::Bytes; use chrono::{TimeZone, Utc}; -use futures::{future::BoxFuture, stream::BoxStream, StreamExt, TryStreamExt}; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use snafu::{ResultExt, Snafu}; use std::collections::BTreeSet; use std::fmt::{Debug, Formatter}; @@ -765,70 +765,47 @@ impl AzureMultiPartUpload { } } +#[async_trait] impl CloudMultiPartUploadImpl for AzureMultiPartUpload { - fn put_multipart_part( + async fn put_multipart_part( &self, buf: Vec, part_idx: usize, - ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>> { - let client = Arc::clone(&self.container_client); - let location = self.location.clone(); + ) -> Result { let block_id = self.get_block_id(part_idx); - Box::pin(async move { - client - .blob_client(location.as_ref()) - .put_block(block_id.clone(), buf) - .into_future() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + self.container_client + .blob_client(self.location.as_ref()) + .put_block(block_id.clone(), buf) + .into_future() + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - Ok(( - part_idx, - UploadPart { - content_id: block_id, - }, - )) + Ok(UploadPart { + content_id: block_id, }) } - fn complete( - &self, - completed_parts: Vec>, - ) -> BoxFuture<'static, Result<(), io::Error>> { - let parts = - completed_parts - .into_iter() - .enumerate() - .map(|(part_number, maybe_part)| match maybe_part { - Some(part) => { - Ok(azure_storage_blobs::blob::BlobBlockType::Uncommitted( - azure_storage_blobs::prelude::BlockId::new(part.content_id), - )) - } - None => Err(io::Error::new( - io::ErrorKind::Other, - format!("Missing information for upload part {:?}", part_number), - )), - }); - - let client = Arc::clone(&self.container_client); - let location = self.location.clone(); - - Box::pin(async move { - let block_list = azure_storage_blobs::blob::BlockList { - blocks: parts.collect::>()?, - }; - - client - .blob_client(location.as_ref()) - .put_block_list(block_list) - .into_future() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error> { + let blocks = completed_parts + .into_iter() + .map(|part| { + azure_storage_blobs::blob::BlobBlockType::Uncommitted( 
+ azure_storage_blobs::prelude::BlockId::new(part.content_id), + ) + }) + .collect(); - Ok(()) - }) + let block_list = azure_storage_blobs::blob::BlockList { blocks }; + + self.container_client + .blob_client(self.location.as_ref()) + .put_block_list(block_list) + .into_future() + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + Ok(()) } } diff --git a/src/client/mod.rs b/src/client/mod.rs index 1166ebe..7241002 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -18,6 +18,8 @@ //! Generic utilities reqwest based ObjectStore implementations pub mod backoff; +#[cfg(feature = "gcp")] pub mod oauth; +pub mod pagination; pub mod retry; pub mod token; diff --git a/src/client/pagination.rs b/src/client/pagination.rs new file mode 100644 index 0000000..3ab17fe --- /dev/null +++ b/src/client/pagination.rs @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::Result; +use futures::Stream; +use std::future::Future; + +/// Takes a paginated operation `op` that when called with: +/// +/// - A state `S` +/// - An optional next token `Option` +/// +/// Returns +/// +/// - A response value `T` +/// - The next state `S` +/// - The next continuation token `Option` +/// +/// And converts it into a `Stream>` which will first call `op(state, None)`, and yield +/// the returned response `T`. 
If the returned continuation token was `None` the stream will then +/// finish, otherwise it will continue to call `op(state, token)` with the values returned by the +/// previous call to `op`, until a continuation token of `None` is returned +/// +pub fn stream_paginated(state: S, op: F) -> impl Stream> +where + F: Fn(S, Option) -> Fut + Copy, + Fut: Future)>>, +{ + enum PaginationState { + Start(T), + HasMore(T, String), + Done, + } + + futures::stream::unfold(PaginationState::Start(state), move |state| async move { + let (s, page_token) = match state { + PaginationState::Start(s) => (s, None), + PaginationState::HasMore(s, page_token) => (s, Some(page_token)), + PaginationState::Done => { + return None; + } + }; + + let (resp, s, continuation) = match op(s, page_token).await { + Ok(resp) => resp, + Err(e) => return Some((Err(e), PaginationState::Done)), + }; + + let next_state = match continuation { + Some(token) => PaginationState::HasMore(s, token), + None => PaginationState::Done, + }; + + Some((Ok(resp), next_state)) + }) +} diff --git a/src/client/token.rs b/src/client/token.rs index a56a294..2ff2861 100644 --- a/src/client/token.rs +++ b/src/client/token.rs @@ -30,11 +30,19 @@ pub struct TemporaryToken { /// Provides [`TokenCache::get_or_insert_with`] which can be used to cache a /// [`TemporaryToken`] based on its expiry -#[derive(Debug, Default)] +#[derive(Debug)] pub struct TokenCache { cache: Mutex>>, } +impl Default for TokenCache { + fn default() -> Self { + Self { + cache: Default::default(), + } + } +} + impl TokenCache { pub async fn get_or_insert_with(&self, f: F) -> Result where diff --git a/src/gcp.rs b/src/gcp.rs index 0dc5a95..c9bb633 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -38,7 +38,6 @@ use std::sync::Arc; use async_trait::async_trait; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; -use futures::future::BoxFuture; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; @@ -46,6 +45,7 @@ use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, Snafu}; use tokio::io::AsyncWrite; +use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::{ client::{oauth::OAuthProvider, token::TokenCache}, @@ -476,44 +476,16 @@ impl GoogleCloudStorageClient { &self, prefix: Option<&Path>, delimiter: bool, - ) -> Result>> { + ) -> BoxStream<'_, Result> { let prefix = format_prefix(prefix); - - enum ListState { - Start, - HasMore(String), - Done, - } - - Ok(futures::stream::unfold(ListState::Start, move |state| { - let prefix = prefix.clone(); - - async move { - let page_token = match &state { - ListState::Start => None, - ListState::HasMore(page_token) => Some(page_token.as_str()), - ListState::Done => { - return None; - } - }; - - let resp = match self - .list_request(prefix.as_deref(), delimiter, page_token) - .await - { - Ok(resp) => resp, - Err(e) => return Some((Err(e), state)), - }; - - let next_state = match &resp.next_page_token { - Some(token) => ListState::HasMore(token.clone()), - None => ListState::Done, - }; - - Some((Ok(resp), next_state)) - } + stream_paginated(prefix, move |prefix, token| async move { + let mut r = self + .list_request(prefix.as_deref(), delimiter, token.as_deref()) + .await?; + let next_token = r.next_page_token.take(); + Ok((r, prefix, next_token)) }) - .boxed()) + .boxed() } } @@ -544,116 +516,105 @@ struct GCSMultipartUpload { multipart_id: MultipartId, } +#[async_trait] impl 
CloudMultiPartUploadImpl for GCSMultipartUpload { /// Upload an object part - fn put_multipart_part( + async fn put_multipart_part( &self, buf: Vec, part_idx: usize, - ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>> { + ) -> Result { let upload_id = self.multipart_id.clone(); let url = format!( "{}/{}/{}", self.client.base_url, self.client.bucket_name_encoded, self.encoded_path ); - let client = Arc::clone(&self.client); - - Box::pin(async move { - let token = client - .get_token() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - - let response = client - .client - .request(Method::PUT, &url) - .bearer_auth(token) - .query(&[ - ("partNumber", format!("{}", part_idx + 1)), - ("uploadId", upload_id), - ]) - .header(header::CONTENT_TYPE, "application/octet-stream") - .header(header::CONTENT_LENGTH, format!("{}", buf.len())) - .body(buf) - .send_retry(&client.retry_config) - .await - .map_err(reqwest_error_as_io)? - .error_for_status() - .map_err(reqwest_error_as_io)?; - - let content_id = response - .headers() - .get("ETag") - .ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "response headers missing ETag", - ) - })? - .to_str() - .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))? - .to_string(); - Ok((part_idx, UploadPart { content_id })) - }) + let token = self + .client + .get_token() + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + let response = self + .client + .client + .request(Method::PUT, &url) + .bearer_auth(token) + .query(&[ + ("partNumber", format!("{}", part_idx + 1)), + ("uploadId", upload_id), + ]) + .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_LENGTH, format!("{}", buf.len())) + .body(buf) + .send_retry(&self.client.retry_config) + .await + .map_err(reqwest_error_as_io)? + .error_for_status() + .map_err(reqwest_error_as_io)?; + + let content_id = response + .headers() + .get("ETag") + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + "response headers missing ETag", + ) + })? + .to_str() + .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))? + .to_string(); + + Ok(UploadPart { content_id }) } /// Complete a multipart upload - fn complete( - &self, - completed_parts: Vec>, - ) -> BoxFuture<'static, Result<(), io::Error>> { - let client = Arc::clone(&self.client); + async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error> { let upload_id = self.multipart_id.clone(); let url = format!( "{}/{}/{}", self.client.base_url, self.client.bucket_name_encoded, self.encoded_path ); - Box::pin(async move { - let parts: Vec = completed_parts - .into_iter() - .enumerate() - .map(|(part_number, maybe_part)| match maybe_part { - Some(part) => Ok(MultipartPart { - e_tag: part.content_id, - part_number: part_number + 1, - }), - None => Err(io::Error::new( - io::ErrorKind::Other, - format!("Missing information for upload part {:?}", part_number), - )), - }) - .collect::, io::Error>>()?; - - let token = client - .get_token() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - - let upload_info = CompleteMultipartUpload { parts }; - - let data = quick_xml::se::to_string(&upload_info) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? 
- // We cannot disable the escaping that transforms "/" to ""e;" :( - // https://github.com/tafia/quick-xml/issues/362 - // https://github.com/tafia/quick-xml/issues/350 - .replace(""", "\""); - - client - .client - .request(Method::POST, &url) - .bearer_auth(token) - .query(&[("uploadId", upload_id)]) - .body(data) - .send_retry(&client.retry_config) - .await - .map_err(reqwest_error_as_io)? - .error_for_status() - .map_err(reqwest_error_as_io)?; - - Ok(()) - }) + let parts = completed_parts + .into_iter() + .enumerate() + .map(|(part_number, part)| MultipartPart { + e_tag: part.content_id, + part_number: part_number + 1, + }) + .collect(); + + let token = self + .client + .get_token() + .await + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + + let upload_info = CompleteMultipartUpload { parts }; + + let data = quick_xml::se::to_string(&upload_info) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? + // We cannot disable the escaping that transforms "/" to ""e;" :( + // https://github.com/tafia/quick-xml/issues/362 + // https://github.com/tafia/quick-xml/issues/350 + .replace(""", "\""); + + self.client + .client + .request(Method::POST, &url) + .bearer_auth(token) + .query(&[("uploadId", upload_id)]) + .body(data) + .send_retry(&self.client.retry_config) + .await + .map_err(reqwest_error_as_io)? + .error_for_status() + .map_err(reqwest_error_as_io)?; + + Ok(()) } } @@ -734,7 +695,7 @@ impl ObjectStore for GoogleCloudStorage { ) -> Result>> { let stream = self .client - .list_paginated(prefix, false)? + .list_paginated(prefix, false) .map_ok(|r| { futures::stream::iter( r.items.into_iter().map(|x| convert_object_meta(&x)), @@ -747,7 +708,7 @@ impl ObjectStore for GoogleCloudStorage { } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true)?; + let mut stream = self.client.list_paginated(prefix, true); let mut common_prefixes = BTreeSet::new(); let mut objects = Vec::new(); diff --git a/src/lib.rs b/src/lib.rs index f7adedb..374f559 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -165,10 +165,10 @@ pub mod memory; pub mod path; pub mod throttle; -#[cfg(feature = "gcp")] +#[cfg(any(feature = "gcp", feature = "aws"))] mod client; -#[cfg(feature = "gcp")] +#[cfg(any(feature = "gcp", feature = "aws"))] pub use client::{backoff::BackoffConfig, retry::RetryConfig}; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] @@ -471,6 +471,16 @@ pub enum Error { OAuth { source: client::oauth::Error }, } +impl From for std::io::Error { + fn from(e: Error) -> Self { + let kind = match &e { + Error::NotFound { .. } => std::io::ErrorKind::NotFound, + _ => std::io::ErrorKind::Other, + }; + Self::new(kind, e) + } +} + #[cfg(test)] mod test_util { use super::*; diff --git a/src/multipart.rs b/src/multipart.rs index c16022d..1985d86 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. 
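
// A small illustration of the `impl From<Error> for std::io::Error` added to lib.rs above:
// store errors can now cross io-flavoured boundaries with `?` or `.into()`, and a missing
// object keeps its NotFound classification. Minimal sketch, assuming `err` is any
// object_store::Error value:
fn classify(err: object_store::Error) -> std::io::ErrorKind {
    let io_err: std::io::Error = err.into();
    // Error::NotFound { .. } maps to ErrorKind::NotFound, everything else to Other.
    io_err.kind()
}
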
-use futures::{future::BoxFuture, stream::FuturesUnordered, Future, StreamExt}; +use async_trait::async_trait; +use futures::{stream::FuturesUnordered, Future, StreamExt}; use std::{io, pin::Pin, sync::Arc, task::Poll}; use tokio::io::AsyncWrite; @@ -26,23 +27,19 @@ type BoxedTryFuture = Pin> + Sen /// A trait that can be implemented by cloud-based object stores /// and used in combination with [`CloudMultiPartUpload`] to provide /// multipart upload support -/// -/// Note: this does not use AsyncTrait as the lifetimes are difficult to manage -pub(crate) trait CloudMultiPartUploadImpl { +#[async_trait] +pub(crate) trait CloudMultiPartUploadImpl: 'static { /// Upload a single part - fn put_multipart_part( + async fn put_multipart_part( &self, buf: Vec, part_idx: usize, - ) -> BoxFuture<'static, Result<(usize, UploadPart), io::Error>>; + ) -> Result; /// Complete the upload with the provided parts /// /// `completed_parts` is in order of part number - fn complete( - &self, - completed_parts: Vec>, - ) -> BoxFuture<'static, Result<(), io::Error>>; + async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error>; } #[derive(Debug, Clone)] @@ -128,10 +125,12 @@ where self.current_buffer.extend_from_slice(buf); let out_buffer = std::mem::take(&mut self.current_buffer); - let task = self - .inner - .put_multipart_part(out_buffer, self.current_part_idx); - self.tasks.push(task); + let inner = Arc::clone(&self.inner); + let part_idx = self.current_part_idx; + self.tasks.push(Box::pin(async move { + let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; + Ok((part_idx, upload_part)) + })); self.current_part_idx += 1; // We need to poll immediately after adding to setup waker @@ -157,10 +156,12 @@ where // If current_buffer is not empty, see if it can be submitted if !self.current_buffer.is_empty() && self.tasks.len() < self.max_concurrency { let out_buffer: Vec = std::mem::take(&mut self.current_buffer); - let task = self - .inner - .put_multipart_part(out_buffer, self.current_part_idx); - self.tasks.push(task); + let inner = Arc::clone(&self.inner); + let part_idx = self.current_part_idx; + self.tasks.push(Box::pin(async move { + let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; + Ok((part_idx, upload_part)) + })); } self.as_mut().poll_tasks(cx)?; @@ -185,10 +186,26 @@ where // If shutdown task is not set, set it let parts = std::mem::take(&mut self.completed_parts); + let parts = parts + .into_iter() + .enumerate() + .map(|(idx, part)| { + part.ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("Missing information for upload part {}", idx), + ) + }) + }) + .collect::>()?; + let inner = Arc::clone(&self.inner); - let completion_task = self - .completion_task - .get_or_insert_with(|| inner.complete(parts)); + let completion_task = self.completion_task.get_or_insert_with(|| { + Box::pin(async move { + inner.complete(parts).await?; + Ok(()) + }) + }); Pin::new(completion_task).poll(cx) } From b251c55abd9172475e51ee19c792d1a5cfb2edc6 Mon Sep 17 00:00:00 2001 From: Quentin Date: Sat, 20 Aug 2022 17:53:50 +0200 Subject: [PATCH 026/397] Build AmazonS3builder from environment variables (#2361) (#2536) * Build AmazonS3Builder from env (#2361) * clippy: use Self instead of AmazonS3Builder * rustdoc: remove unnecessary bare_url * Save the current environment setup * fmt: missing use in documentation --- src/aws/mod.rs | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/src/aws/mod.rs 
b/src/aws/mod.rs index 06d20cc..2a13083 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -347,6 +347,48 @@ impl AmazonS3Builder { Default::default() } + /// Fill the [`AmazonS3Builder`] with regular AWS environment variables + /// + /// Variables extracted from environment: + /// * AWS_ACCESS_KEY_ID -> access_key_id + /// * AWS_SECRET_ACCESS_KEY -> secret_access_key + /// * AWS_DEFAULT_REGION -> region + /// * AWS_ENDPOINT -> endpoint + /// * AWS_SESSION_TOKEN -> token + /// # Example + /// ``` + /// use object_store::aws::AmazonS3Builder; + /// + /// let s3 = AmazonS3Builder::from_env() + /// .with_bucket_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder: Self = Default::default(); + + if let Ok(access_key_id) = std::env::var("AWS_ACCESS_KEY_ID") { + builder.access_key_id = Some(access_key_id); + } + + if let Ok(secret_access_key) = std::env::var("AWS_SECRET_ACCESS_KEY") { + builder.secret_access_key = Some(secret_access_key); + } + + if let Ok(secret) = std::env::var("AWS_DEFAULT_REGION") { + builder.region = Some(secret); + } + + if let Ok(endpoint) = std::env::var("AWS_ENDPOINT") { + builder.endpoint = Some(endpoint); + } + + if let Ok(token) = std::env::var("AWS_SESSION_TOKEN") { + builder.token = Some(token); + } + + builder + } + /// Set the AWS Access Key (required) pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { self.access_key_id = Some(access_key_id.into()); @@ -574,6 +616,42 @@ mod tests { }}; } + #[test] + fn s3_test_config_from_env() { + let aws_access_key_id = env::var("AWS_ACCESS_KEY_ID") + .unwrap_or("object_store:fake_access_key_id".into()); + let aws_secret_access_key = env::var("AWS_SECRET_ACCESS_KEY") + .unwrap_or("object_store:fake_secret_key".into()); + + let aws_default_region = env::var("AWS_DEFAULT_REGION") + .unwrap_or("object_store:fake_default_region".into()); + + let aws_endpoint = + env::var("AWS_ENDPOINT").unwrap_or("object_store:fake_endpoint".into()); + let aws_session_token = env::var("AWS_SESSION_TOKEN") + .unwrap_or("object_store:fake_session_token".into()); + + // required + env::set_var("AWS_ACCESS_KEY_ID", &aws_access_key_id); + env::set_var("AWS_SECRET_ACCESS_KEY", &aws_secret_access_key); + env::set_var("AWS_DEFAULT_REGION", &aws_default_region); + + // optional + env::set_var("AWS_ENDPOINT", &aws_endpoint); + env::set_var("AWS_SESSION_TOKEN", &aws_session_token); + + let builder = AmazonS3Builder::from_env(); + assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); + assert_eq!( + builder.secret_access_key.unwrap(), + aws_secret_access_key.as_str() + ); + assert_eq!(builder.region.unwrap(), aws_default_region); + + assert_eq!(builder.endpoint.unwrap(), aws_endpoint); + assert_eq!(builder.token.unwrap(), aws_session_token); + } + #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); From 5eaf136f911ae038fc61e262cff6439ec1482a98 Mon Sep 17 00:00:00 2001 From: Quentin Date: Tue, 23 Aug 2022 11:22:07 +0200 Subject: [PATCH 027/397] Update environment variable name for amazonS3builder in integration (#2550) (#2553) * update variable name for amazonS3builder in integration (#2550) * apply env at step level (#2550) --- src/aws/mod.rs | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 2a13083..9a7a5b8 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -547,10 +547,10 @@ mod tests { dotenv::dotenv().ok(); let required_vars = [ - "AWS_DEFAULT_REGION", + 
"OBJECT_STORE_AWS_DEFAULT_REGION", "OBJECT_STORE_BUCKET", - "AWS_ACCESS_KEY_ID", - "AWS_SECRET_ACCESS_KEY", + "OBJECT_STORE_AWS_ACCESS_KEY_ID", + "OBJECT_STORE_AWS_SECRET_ACCESS_KEY", ]; let unset_vars: Vec<_> = required_vars .iter() @@ -582,16 +582,16 @@ mod tests { } else { let config = AmazonS3Builder::new() .with_access_key_id( - env::var("AWS_ACCESS_KEY_ID") - .expect("already checked AWS_ACCESS_KEY_ID"), + env::var("OBJECT_STORE_AWS_ACCESS_KEY_ID") + .expect("already checked OBJECT_STORE_AWS_ACCESS_KEY_ID"), ) .with_secret_access_key( - env::var("AWS_SECRET_ACCESS_KEY") - .expect("already checked AWS_SECRET_ACCESS_KEY"), + env::var("OBJECT_STORE_AWS_SECRET_ACCESS_KEY") + .expect("already checked OBJECT_STORE_AWS_SECRET_ACCESS_KEY"), ) .with_region( - env::var("AWS_DEFAULT_REGION") - .expect("already checked AWS_DEFAULT_REGION"), + env::var("OBJECT_STORE_AWS_DEFAULT_REGION") + .expect("already checked OBJECT_STORE_AWS_DEFAULT_REGION"), ) .with_bucket_name( env::var("OBJECT_STORE_BUCKET") @@ -599,13 +599,16 @@ mod tests { ) .with_allow_http(true); - let config = if let Some(endpoint) = env::var("AWS_ENDPOINT").ok() { - config.with_endpoint(endpoint) - } else { - config - }; + let config = + if let Some(endpoint) = env::var("OBJECT_STORE_AWS_ENDPOINT").ok() { + config.with_endpoint(endpoint) + } else { + config + }; - let config = if let Some(token) = env::var("AWS_SESSION_TOKEN").ok() { + let config = if let Some(token) = + env::var("OBJECT_STORE_AWS_SESSION_TOKEN").ok() + { config.with_token(token) } else { config From eaff982fd8c79c860e78ef02df5b760684aabc43 Mon Sep 17 00:00:00 2001 From: Dan Harris <1327726+thinkharderdev@users.noreply.github.com> Date: Tue, 23 Aug 2022 05:34:41 -0400 Subject: [PATCH 028/397] Fix panix in coalesce (#2554) --- src/util.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/util.rs b/src/util.rs index 46e9e9e..f548ed4 100644 --- a/src/util.rs +++ b/src/util.rs @@ -87,11 +87,17 @@ where F: Send + FnMut(std::ops::Range) -> Fut, Fut: std::future::Future> + Send, { + if ranges.is_empty() { + return Ok(vec![]); + } + let mut ret = Vec::with_capacity(ranges.len()); let mut start_idx = 0; let mut end_idx = 1; while start_idx != ranges.len() { + let mut range_end = ranges[start_idx].end; + while end_idx != ranges.len() && ranges[end_idx] .start @@ -99,12 +105,14 @@ where .map(|delta| delta <= coalesce) .unwrap_or(false) { + if ranges[end_idx].end > range_end { + range_end = ranges[end_idx].end; + } end_idx += 1; } let start = ranges[start_idx].start; - let end = ranges[end_idx - 1].end; - let bytes = fetch(start..end).await?; + let bytes = fetch(start..range_end).await?; for range in ranges.iter().take(end_idx).skip(start_idx) { ret.push(bytes.slice(range.start - start..range.end - start)) } @@ -164,5 +172,11 @@ mod tests { let fetches = do_fetch(vec![0..1, 5..6, 7..9, 2..3, 4..6], 1).await; assert_eq!(fetches, vec![0..1, 5..9, 2..6]); + + let fetches = do_fetch(vec![0..1, 5..6, 7..9, 2..3, 4..6], 1).await; + assert_eq!(fetches, vec![0..1, 5..9, 2..6]); + + let fetches = do_fetch(vec![0..1, 6..7, 8..9, 10..14, 9..10], 4).await; + assert_eq!(fetches, vec![0..1, 6..14]); } } From ebdb6c40dea4972868d68689c236e094459963d4 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Wed, 24 Aug 2022 19:08:36 +0200 Subject: [PATCH 029/397] Replace azure sdk with custom implementation (#2509) * feat: remove azure SDK * chore: remove httpdate crate * chore: some clippy * chore: include sas keys in 
builder * chore: cleanup credential a bit * feat: azure client credential auth with custom impl * fix: remove local test * feat: add cache to ClientSecretOAuthProvider * chore: fmt * chore: cleanup * fix: remove unwraps * chore: cleanup Cargo.toml * fix: fix azure multipart requests * feat: create builder from environment * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fix: address PR comments * fix: service principal auth & multipart * fix: appen sas query pairs to copy url * Update object_store/src/azure/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix SAS token escaping in CopyRequest Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- Cargo.toml | 35 +- src/aws/credential.rs | 6 +- src/azure.rs | 886 --------------------------------------- src/azure/client.rs | 743 ++++++++++++++++++++++++++++++++ src/azure/credential.rs | 255 +++++++++++ src/azure/mod.rs | 705 +++++++++++++++++++++++++++++++ src/client/mod.rs | 2 +- src/client/oauth.rs | 85 +++- src/client/pagination.rs | 6 +- src/lib.rs | 38 +- src/util.rs | 11 +- 11 files changed, 1831 insertions(+), 941 deletions(-) delete mode 100644 src/azure.rs create mode 100644 src/azure/client.rs create mode 100644 src/azure/credential.rs create mode 100644 src/azure/mod.rs diff --git a/Cargo.toml b/Cargo.toml index 8c713d8..966c423 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,38 +30,33 @@ all-features = true [dependencies] # In alphabetical order async-trait = "0.1.53" -# Microsoft Azure Blob storage integration -azure_core = { version = "0.4", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } -azure_identity = { version = "0.5", optional = true, default-features = false, features = ["enable_reqwest_rustls"]} -azure_storage = { version = "0.5", optional = true, default-features = false, features = ["enable_reqwest_rustls"]} -azure_storage_blobs = { version = "0.5", optional = true, default-features = false, features = ["enable_reqwest_rustls"] } bytes = "1.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } -# Google Cloud Storage integration futures = "0.3" -serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } -serde_json = { version = "1.0", default-features = false, optional = true } -quick-xml = { version = "0.23.0", features = ["serialize"], optional = true } -rustls-pemfile = { version = "1.0", default-features = false, optional = true } -ring = { version = "0.16", default-features = false, features = ["std"], optional = true } -base64 = { version = "0.13", default-features = false, optional = true } -rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } itertools = "0.10.1" +parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } tracing = { version = "0.1" } -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } -parking_lot = { version = "0.12" } -# Filesystem integration url = "2.2" walkdir = "2" +# Cloud storage support +base64 = { version = "0.13", default-features = false, optional = true } +quick-xml = { version = "0.23.0", features = ["serialize"], optional = true } +serde = { version = "1.0", default-features = 
false, features = ["derive"], optional = true } +serde_json = { version = "1.0", default-features = false, optional = true } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } +ring = { version = "0.16", default-features = false, features = ["std"], optional = true } +rustls-pemfile = { version = "1.0", default-features = false, optional = true } + [features] -azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest", "azure_identity"] -azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] -gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] -aws = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] +azure = ["cloud"] +gcp = ["cloud"] +aws = ["cloud"] [dev-dependencies] # In alphabetical order dotenv = "0.15.0" diff --git a/src/aws/credential.rs b/src/aws/credential.rs index b750059..e6c1bdd 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -17,6 +17,7 @@ use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; +use crate::util::hmac_sha256; use crate::{Result, RetryConfig}; use bytes::Buf; use chrono::{DateTime, Utc}; @@ -188,11 +189,6 @@ impl CredentialExt for RequestBuilder { } } -fn hmac_sha256(secret: impl AsRef<[u8]>, bytes: impl AsRef<[u8]>) -> ring::hmac::Tag { - let key = ring::hmac::Key::new(ring::hmac::HMAC_SHA256, secret.as_ref()); - ring::hmac::sign(&key, bytes.as_ref()) -} - /// Computes the SHA256 digest of `body` returned as a hex encoded string fn hex_digest(bytes: &[u8]) -> String { let digest = ring::digest::digest(&ring::digest::SHA256, bytes); diff --git a/src/azure.rs b/src/azure.rs deleted file mode 100644 index a9dbc53..0000000 --- a/src/azure.rs +++ /dev/null @@ -1,886 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! An object store implementation for Azure blob storage -//! -//! ## Streaming uploads -//! -//! [ObjectStore::put_multipart] will upload data in blocks and write a blob from those -//! blocks. Data is buffered internally to make blocks of at least 5MB and blocks -//! are uploaded concurrently. -//! -//! [ObjectStore::abort_multipart] is a no-op, since Azure Blob Store doesn't provide -//! a way to drop old blocks. 
Instead unused blocks are automatically cleaned up -//! after 7 days. -use crate::{ - multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, - path::{Path, DELIMITER}, - util::format_prefix, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, -}; -use async_trait::async_trait; -use azure_core::{ - error::{Error as AzureError, ErrorKind as AzureErrorKind}, - prelude::*, - StatusCode, -}; -use azure_identity::{ - AutoRefreshingTokenCredential, ClientSecretCredential, TokenCredentialOptions, -}; -use azure_storage::core::clients::StorageClient; -use azure_storage_blobs::blob::Blob; -use azure_storage_blobs::container::operations::ListBlobsResponse; -use azure_storage_blobs::prelude::{ - AsContainerClient, ContainerClient, DeleteSnapshotsMethod, -}; -use bytes::Bytes; -use chrono::{TimeZone, Utc}; -use futures::{stream::BoxStream, StreamExt, TryStreamExt}; -use snafu::{ResultExt, Snafu}; -use std::collections::BTreeSet; -use std::fmt::{Debug, Formatter}; -use std::io; -use std::{convert::TryInto, sync::Arc}; -use tokio::io::AsyncWrite; -use url::Url; - -/// A specialized `Error` for Azure object store-related errors -#[derive(Debug, Snafu)] -#[allow(missing_docs)] -enum Error { - #[snafu(display( - "Unable to DELETE data. Container: {}, Location: {}, Error: {} ({:?})", - container, - path, - source, - source, - ))] - UnableToDeleteData { - source: AzureError, - container: String, - path: String, - }, - - #[snafu(display( - "Unable to GET data. Container: {}, Location: {}, Error: {} ({:?})", - container, - path, - source, - source, - ))] - UnableToGetData { - source: AzureError, - container: String, - path: String, - }, - - #[snafu(display( - "Unable to HEAD data. Container: {}, Location: {}, Error: {} ({:?})", - container, - path, - source, - source, - ))] - UnableToHeadData { - source: AzureError, - container: String, - path: String, - }, - - #[snafu(display( - "Unable to GET part of the data. Container: {}, Location: {}, Error: {} ({:?})", - container, - path, - source, - source, - ))] - UnableToGetPieceOfData { - source: AzureError, - container: String, - path: String, - }, - - #[snafu(display( - "Unable to PUT data. Bucket: {}, Location: {}, Error: {} ({:?})", - container, - path, - source, - source, - ))] - UnableToPutData { - source: AzureError, - container: String, - path: String, - }, - - #[snafu(display( - "Unable to list data. Bucket: {}, Error: {} ({:?})", - container, - source, - source, - ))] - UnableToListData { - source: AzureError, - container: String, - }, - - #[snafu(display( - "Unable to copy object. Container: {}, From: {}, To: {}, Error: {}", - container, - from, - to, - source - ))] - UnableToCopyFile { - source: AzureError, - container: String, - from: String, - to: String, - }, - - #[snafu(display( - "Unable parse source url. 
Container: {}, Error: {}", - container, - source - ))] - UnableToParseUrl { - source: url::ParseError, - container: String, - }, - - NotFound { - path: String, - source: AzureError, - }, - - AlreadyExists { - path: String, - source: AzureError, - }, - - #[cfg(not(feature = "azure_test"))] - #[snafu(display( - "Azurite (azure emulator) support not compiled in, please add `azure_test` feature" - ))] - NoEmulatorFeature, - - #[snafu(display( - "Unable parse emulator url {}={}, Error: {}", - env_name, - env_value, - source - ))] - UnableToParseEmulatorUrl { - env_name: String, - env_value: String, - source: url::ParseError, - }, - - #[snafu(display("Account must be specified"))] - MissingAccount {}, - - #[snafu(display("Container name must be specified"))] - MissingContainerName {}, - - #[snafu(display("At least one authorization option must be specified"))] - MissingCredentials {}, -} - -impl From for super::Error { - fn from(source: Error) -> Self { - match source { - Error::NotFound { path, source } => Self::NotFound { - path, - source: Box::new(source), - }, - Error::AlreadyExists { path, source } => Self::AlreadyExists { - path, - source: Box::new(source), - }, - _ => Self::Generic { - store: "Azure Blob Storage", - source: Box::new(source), - }, - } - } -} - -/// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). -#[derive(Debug)] -pub struct MicrosoftAzure { - container_client: Arc, - container_name: String, - blob_base_url: String, - is_emulator: bool, -} - -impl std::fmt::Display for MicrosoftAzure { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self.is_emulator { - true => write!(f, "MicrosoftAzureEmulator({})", self.container_name), - false => write!(f, "MicrosoftAzure({})", self.container_name), - } - } -} - -#[async_trait] -impl ObjectStore for MicrosoftAzure { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - let bytes = bytes::BytesMut::from(&*bytes); - - self.container_client - .blob_client(location.as_ref()) - .put_block_blob(bytes) - .into_future() - .await - .context(UnableToPutDataSnafu { - container: &self.container_name, - path: location.to_owned(), - })?; - - Ok(()) - } - - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - let inner = AzureMultiPartUpload { - container_client: Arc::clone(&self.container_client), - location: location.to_owned(), - }; - Ok((String::new(), Box::new(CloudMultiPartUpload::new(inner, 8)))) - } - - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { - // There is no way to drop blocks that have been uploaded. Instead, they simply - // expire in 7 days. - Ok(()) - } - - async fn get(&self, location: &Path) -> Result { - let loc = location.clone(); - let mut stream = self - .container_client - .blob_client(location.as_ref()) - .get() - .into_stream() - .and_then(|chunk| chunk.data.collect()) - .map_err(move |err| match err.kind() { - AzureErrorKind::HttpResponse { - status: StatusCode::NotFound, - .. 
- } => crate::Error::NotFound { - source: Box::new(err), - path: loc.to_string(), - }, - _ => crate::Error::Generic { - source: Box::new(err), - store: "MicrosoftAzure", - }, - }) - .boxed(); - - let first = stream.next().await.transpose()?.unwrap_or_default(); - Ok(GetResult::Stream(Box::pin( - futures::stream::once(async { Ok(first) }).chain(stream), - ))) - } - - async fn get_range( - &self, - location: &Path, - range: std::ops::Range, - ) -> Result { - let map_azure_err = |err: AzureError| match err.kind() { - AzureErrorKind::HttpResponse { - status: StatusCode::NotFound, - .. - } => Error::NotFound { - source: err, - path: location.to_string(), - }, - _ => Error::UnableToGetPieceOfData { - source: err, - container: self.container_name.clone(), - path: location.to_string(), - }, - }; - - let mut stream = self - .container_client - .blob_client(location.as_ref()) - .get() - .range(range) - .into_stream(); - - let mut chunk: Vec = vec![]; - while let Some(value) = stream.next().await { - let value = value - .map_err(map_azure_err)? - .data - .collect() - .await - .map_err(map_azure_err)?; - chunk.extend(&value); - } - - Ok(chunk.into()) - } - - async fn head(&self, location: &Path) -> Result { - let res = self - .container_client - .blob_client(location.as_ref()) - .get_properties() - .into_future() - .await - .map_err(|err| match err.kind() { - AzureErrorKind::HttpResponse { - status: StatusCode::NotFound, - .. - } => Error::NotFound { - source: err, - path: location.to_string(), - }, - _ => Error::UnableToHeadData { - source: err, - container: self.container_name.clone(), - path: location.to_string(), - }, - })?; - - convert_object_meta(res.blob)?.ok_or_else(|| super::Error::NotFound { - path: location.to_string(), - source: "is directory".to_string().into(), - }) - } - - async fn delete(&self, location: &Path) -> Result<()> { - self.container_client - .blob_client(location.as_ref()) - .delete() - .delete_snapshots_method(DeleteSnapshotsMethod::Include) - .into_future() - .await - .context(UnableToDeleteDataSnafu { - container: &self.container_name, - path: location.to_string(), - })?; - - Ok(()) - } - - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - let stream = self - .list_impl(prefix, false) - .await? - .map_ok(|resp| { - let names = resp - .blobs - .blobs - .into_iter() - .filter_map(|blob| convert_object_meta(blob).transpose()); - futures::stream::iter(names) - }) - .try_flatten() - .boxed(); - - Ok(stream) - } - - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.list_impl(prefix, true).await?; - - let mut common_prefixes = BTreeSet::new(); - let mut objects = Vec::new(); - - while let Some(res) = stream.next().await { - let response = res?; - - let prefixes = response.blobs.blob_prefix.unwrap_or_default(); - for p in prefixes { - common_prefixes.insert(Path::parse(&p.name)?); - } - - let blobs = response.blobs.blobs; - objects.reserve(blobs.len()); - for blob in blobs { - if let Some(meta) = convert_object_meta(blob)? 
{ - objects.push(meta); - } - } - } - - Ok(ListResult { - common_prefixes: common_prefixes.into_iter().collect(), - objects, - }) - } - - async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let from_url = self.get_copy_from_url(from)?; - self.container_client - .blob_client(to.as_ref()) - .copy(from_url) - .into_future() - .await - .context(UnableToCopyFileSnafu { - container: &self.container_name, - from: from.as_ref(), - to: to.as_ref(), - })?; - Ok(()) - } - - async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - let from_url = self.get_copy_from_url(from)?; - self.container_client - .blob_client(to.as_ref()) - .copy(from_url) - .if_match(IfMatchCondition::NotMatch("*".to_string())) - .into_future() - .await - .map_err(|err| { - if let AzureErrorKind::HttpResponse { - status: StatusCode::Conflict, - .. - } = err.kind() - { - return Error::AlreadyExists { - source: err, - path: to.to_string(), - }; - }; - Error::UnableToCopyFile { - source: err, - container: self.container_name.clone(), - from: from.to_string(), - to: to.to_string(), - } - })?; - Ok(()) - } -} - -impl MicrosoftAzure { - /// helper function to create a source url for copy function - fn get_copy_from_url(&self, from: &Path) -> Result { - let mut url = - Url::parse(&format!("{}/{}", &self.blob_base_url, self.container_name)) - .context(UnableToParseUrlSnafu { - container: &self.container_name, - })?; - - url.path_segments_mut().unwrap().extend(from.parts()); - Ok(url) - } - - async fn list_impl( - &self, - prefix: Option<&Path>, - delimiter: bool, - ) -> Result>> { - let mut stream = self.container_client.list_blobs(); - if let Some(prefix_val) = format_prefix(prefix) { - stream = stream.prefix(prefix_val); - } - if delimiter { - stream = stream.delimiter(Delimiter::new(DELIMITER)); - } - - let stream = stream - .into_stream() - .map(|resp| match resp { - Ok(list_blobs) => Ok(list_blobs), - Err(err) => Err(crate::Error::from(Error::UnableToListData { - source: err, - container: self.container_name.clone(), - })), - }) - .boxed(); - - Ok(stream) - } -} - -/// Returns `None` if is a directory -fn convert_object_meta(blob: Blob) -> Result> { - let location = Path::parse(blob.name)?; - let last_modified = Utc.timestamp(blob.properties.last_modified.unix_timestamp(), 0); - let size = blob - .properties - .content_length - .try_into() - .expect("unsupported size on this platform"); - - // This is needed to filter out gen2 directories - // https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues#blob-storage-apis - Ok((size > 0).then(|| ObjectMeta { - location, - last_modified, - size, - })) -} - -#[cfg(feature = "azure_test")] -fn check_if_emulator_works() -> Result<()> { - Ok(()) -} - -#[cfg(not(feature = "azure_test"))] -fn check_if_emulator_works() -> Result<()> { - Err(Error::NoEmulatorFeature.into()) -} - -/// Parses the contents of the environment variable `env_name` as a URL -/// if present, otherwise falls back to default_url -fn url_from_env(env_name: &str, default_url: &str) -> Result { - let url = match std::env::var(env_name) { - Ok(env_value) => { - Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { - env_name, - env_value, - })? - } - Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), - }; - Ok(url) -} - -/// Configure a connection to Microsoft Azure Blob Storage container using -/// the specified credentials. 
-/// -/// # Example -/// ``` -/// # let ACCOUNT = "foo"; -/// # let BUCKET_NAME = "foo"; -/// # let ACCESS_KEY = "foo"; -/// # use object_store::azure::MicrosoftAzureBuilder; -/// let azure = MicrosoftAzureBuilder::new() -/// .with_account(ACCOUNT) -/// .with_access_key(ACCESS_KEY) -/// .with_container_name(BUCKET_NAME) -/// .build(); -/// ``` -#[derive(Default)] -pub struct MicrosoftAzureBuilder { - account: Option, - access_key: Option, - container_name: Option, - bearer_token: Option, - client_id: Option, - client_secret: Option, - tenant_id: Option, - use_emulator: bool, -} - -impl Debug for MicrosoftAzureBuilder { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!( - f, - "MicrosoftAzureBuilder {{ account: {:?}, container_name: {:?} }}", - self.account, self.container_name - ) - } -} - -impl MicrosoftAzureBuilder { - /// Create a new [`MicrosoftAzureBuilder`] with default values. - pub fn new() -> Self { - Default::default() - } - - /// Set the Azure Account (required) - pub fn with_account(mut self, account: impl Into) -> Self { - self.account = Some(account.into()); - self - } - - /// Set the Azure Access Key (required - one of access key, bearer token, or client credentials) - pub fn with_access_key(mut self, access_key: impl Into) -> Self { - self.access_key = Some(access_key.into()); - self - } - - /// Set a static bearer token to be used for authorizing requests - /// (required - one of access key, bearer token, or client credentials) - pub fn with_bearer_token(mut self, bearer_token: impl Into) -> Self { - self.bearer_token = Some(bearer_token.into()); - self - } - - /// Set the Azure Container Name (required) - pub fn with_container_name(mut self, container_name: impl Into) -> Self { - self.container_name = Some(container_name.into()); - self - } - - /// Set a client id used for client secret authorization - /// (required - one of access key, bearer token, or client credentials) - pub fn with_client_id(mut self, client_id: impl Into) -> Self { - self.client_id = Some(client_id.into()); - self - } - - /// Set a client secret used for client secret authorization - /// (required - one of access key, bearer token, or client credentials) - pub fn with_client_secret(mut self, client_secret: impl Into) -> Self { - self.client_secret = Some(client_secret.into()); - self - } - - /// Set the tenant id of the Azure AD tenant - /// (required - one of access key, bearer token, or client credentials) - pub fn with_tenant_id(mut self, tenant_id: impl Into) -> Self { - self.tenant_id = Some(tenant_id.into()); - self - } - - /// Set if the Azure emulator should be used (defaults to false) - pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { - self.use_emulator = use_emulator; - self - } - - /// Configure a connection to container with given name on Microsoft Azure - /// Blob store. - pub fn build(self) -> Result { - let Self { - account, - access_key, - container_name, - bearer_token, - client_id, - client_secret, - tenant_id, - use_emulator, - } = self; - - let account = account.ok_or(Error::MissingAccount {})?; - let container_name = container_name.ok_or(Error::MissingContainerName {})?; - - let (is_emulator, storage_client) = if use_emulator { - check_if_emulator_works()?; - // Allow overriding defaults. 
Values taken from - // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 - let blob_storage_url = - url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let queue_storage_url = - url_from_env("AZURITE_QUEUE_STORAGE_URL", "http://127.0.0.1:10001")?; - let table_storage_url = - url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10002")?; - let filesystem_url = - url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10004")?; - - let storage_client = StorageClient::new_emulator( - &blob_storage_url, - &table_storage_url, - &queue_storage_url, - &filesystem_url, - ); - - (true, storage_client) - } else { - let client = if let Some(bearer_token) = bearer_token { - Ok(StorageClient::new_bearer_token(&account, bearer_token)) - } else if let Some(access_key) = access_key { - Ok(StorageClient::new_access_key(&account, access_key)) - } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = - (tenant_id, client_id, client_secret) - { - let credential = Arc::new(AutoRefreshingTokenCredential::new(Arc::new( - ClientSecretCredential::new( - tenant_id, - client_id, - client_secret, - TokenCredentialOptions::default(), - ), - ))); - Ok(StorageClient::new_token_credential(&account, credential)) - } else { - Err(Error::MissingCredentials {}) - }?; - - (false, client) - }; - - let blob_base_url = storage_client - .blob_storage_url() - .as_ref() - // make url ending consistent between the emulator and remote storage account - .trim_end_matches('/') - .to_string(); - - let container_client = Arc::new(storage_client.container_client(&container_name)); - - Ok(MicrosoftAzure { - container_client, - container_name, - blob_base_url, - is_emulator, - }) - } -} - -// Relevant docs: https://azure.github.io/Storage/docs/application-and-user-data/basics/azure-blob-storage-upload-apis/ -// In Azure Blob Store, parts are "blocks" -// put_multipart_part -> PUT block -// complete -> PUT block list -// abort -> No equivalent; blocks are simply dropped after 7 days -#[derive(Debug, Clone)] -struct AzureMultiPartUpload { - container_client: Arc, - location: Path, -} - -impl AzureMultiPartUpload { - /// Gets the block id corresponding to the part index. - /// - /// In Azure, the user determines what id each block has. They must be - /// unique within an upload and of consistent length. 
- fn get_block_id(&self, part_idx: usize) -> String { - format!("{:20}", part_idx) - } -} - -#[async_trait] -impl CloudMultiPartUploadImpl for AzureMultiPartUpload { - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result { - let block_id = self.get_block_id(part_idx); - - self.container_client - .blob_client(self.location.as_ref()) - .put_block(block_id.clone(), buf) - .into_future() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - - Ok(UploadPart { - content_id: block_id, - }) - } - - async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error> { - let blocks = completed_parts - .into_iter() - .map(|part| { - azure_storage_blobs::blob::BlobBlockType::Uncommitted( - azure_storage_blobs::prelude::BlockId::new(part.content_id), - ) - }) - .collect(); - - let block_list = azure_storage_blobs::blob::BlockList { blocks }; - - self.container_client - .blob_client(self.location.as_ref()) - .put_block_list(block_list) - .into_future() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::tests::{ - copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, - }; - use std::env; - - // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment - // variables are not set. - macro_rules! maybe_skip_integration { - () => {{ - dotenv::dotenv().ok(); - - let use_emulator = std::env::var("AZURE_USE_EMULATOR").is_ok(); - - let mut required_vars = vec!["OBJECT_STORE_BUCKET"]; - if !use_emulator { - required_vars.push("AZURE_STORAGE_ACCOUNT"); - required_vars.push("AZURE_STORAGE_ACCESS_KEY"); - } - let unset_vars: Vec<_> = required_vars - .iter() - .filter_map(|&name| match env::var(name) { - Ok(_) => None, - Err(_) => Some(name), - }) - .collect(); - let unset_var_names = unset_vars.join(", "); - - let force = std::env::var("TEST_INTEGRATION"); - - if force.is_ok() && !unset_var_names.is_empty() { - panic!( - "TEST_INTEGRATION is set, \ - but variable(s) {} need to be set", - unset_var_names - ) - } else if force.is_err() { - eprintln!( - "skipping Azure integration test - set {}TEST_INTEGRATION to run", - if unset_var_names.is_empty() { - String::new() - } else { - format!("{} and ", unset_var_names) - } - ); - return; - } else { - MicrosoftAzureBuilder::new() - .with_account(env::var("AZURE_STORAGE_ACCOUNT").unwrap_or_default()) - .with_access_key( - env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), - ) - .with_container_name( - env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - ) - .with_use_emulator(use_emulator) - } - }}; - } - - #[tokio::test] - async fn azure_blob_test() { - let integration = maybe_skip_integration!().build().unwrap(); - - put_get_delete_list(&integration).await; - list_uses_directories_correctly(&integration).await; - list_with_delimiter(&integration).await; - rename_and_copy(&integration).await; - copy_if_not_exists(&integration).await; - } -} diff --git a/src/azure/client.rs b/src/azure/client.rs new file mode 100644 index 0000000..5f37ea9 --- /dev/null +++ b/src/azure/client.rs @@ -0,0 +1,743 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::credential::{AzureCredential, CredentialProvider}; +use crate::azure::credential::*; +use crate::client::pagination::stream_paginated; +use crate::client::retry::RetryExt; +use crate::path::DELIMITER; +use crate::util::{format_http_range, format_prefix}; +use crate::{BoxStream, ListResult, ObjectMeta, Path, Result, RetryConfig, StreamExt}; +use bytes::{Buf, Bytes}; +use chrono::{DateTime, TimeZone, Utc}; +use itertools::Itertools; +use reqwest::{ + header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, + Client as ReqwestClient, Method, Response, StatusCode, +}; +use serde::{Deserialize, Deserializer, Serialize}; +use snafu::{ResultExt, Snafu}; +use std::collections::HashMap; +use std::ops::Range; +use url::Url; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub(crate) enum Error { + #[snafu(display("Error performing get request {}: {}", path, source))] + GetRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing put request {}: {}", path, source))] + PutRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing delete request {}: {}", path, source))] + DeleteRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing copy request {}: {}", path, source))] + CopyRequest { + source: reqwest::Error, + path: String, + }, + + #[snafu(display("Error performing list request: {}", source))] + ListRequest { source: reqwest::Error }, + + #[snafu(display("Error performing create multipart request: {}", source))] + CreateMultipartRequest { source: reqwest::Error }, + + #[snafu(display("Error performing complete multipart request: {}", source))] + CompleteMultipartRequest { source: reqwest::Error }, + + #[snafu(display("Got invalid list response: {}", source))] + InvalidListResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Got invalid multipart response: {}", source))] + InvalidMultipartResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Error authorizing request: {}", source))] + Authorization { source: crate::client::oauth::Error }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + match err { + Error::GetRequest { source, path } + | Error::DeleteRequest { source, path } + | Error::CopyRequest { source, path } + | Error::PutRequest { source, path } + if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => + { + Self::NotFound { + path, + source: Box::new(source), + } + } + Error::CopyRequest { source, path } + if matches!(source.status(), Some(StatusCode::CONFLICT)) => + { + Self::AlreadyExists { + path, + source: Box::new(source), + } + } + _ => Self::Generic { + store: "MicrosoftAzure", + source: Box::new(err), + }, + } + } +} + +/// Configuration for [AzureClient] +#[derive(Debug)] +pub struct AzureConfig { + pub account: String, + pub container: String, + pub credentials: 
CredentialProvider, + pub retry_config: RetryConfig, + pub allow_http: bool, + pub service: Url, + pub is_emulator: bool, +} + +impl AzureConfig { + fn path_url(&self, path: &Path) -> Url { + let mut url = self.service.clone(); + { + let mut path_mut = url.path_segments_mut().unwrap(); + if self.is_emulator { + path_mut.push(&self.account); + } + path_mut.push(&self.container).extend(path.parts()); + } + url + } +} + +#[derive(Debug)] +pub(crate) struct AzureClient { + config: AzureConfig, + client: ReqwestClient, +} + +impl AzureClient { + /// create a new instance of [AzureClient] + pub fn new(config: AzureConfig) -> Self { + let client = reqwest::ClientBuilder::new() + .https_only(!config.allow_http) + .build() + .unwrap(); + + Self { config, client } + } + + /// Returns the config + pub fn config(&self) -> &AzureConfig { + &self.config + } + + async fn get_credential(&self) -> Result { + match &self.config.credentials { + CredentialProvider::AccessKey(key) => { + Ok(AzureCredential::AccessKey(key.to_owned())) + } + CredentialProvider::ClientSecret(cred) => { + let token = cred + .fetch_token(&self.client, &self.config.retry_config) + .await + .context(AuthorizationSnafu)?; + Ok(AzureCredential::AuthorizationToken( + // we do the conversion to a HeaderValue here, since it is fallible + // and we wna to use it in an infallible function + HeaderValue::from_str(&format!("Bearer {}", token)).map_err( + |err| crate::Error::Generic { + store: "MicrosoftAzure", + source: Box::new(err), + }, + )?, + )) + } + CredentialProvider::SASToken(sas) => { + Ok(AzureCredential::SASToken(sas.clone())) + } + } + } + + /// Make an Azure PUT request + pub async fn put_request( + &self, + path: &Path, + bytes: Option, + is_block_op: bool, + query: &T, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + + let mut builder = self.client.request(Method::PUT, url); + + if !is_block_op { + builder = builder.header(&BLOB_TYPE, "BlockBlob").query(query); + } else { + builder = builder.query(query); + } + + if let Some(bytes) = bytes { + builder = builder + .header(CONTENT_LENGTH, HeaderValue::from(bytes.len())) + .body(bytes) + } else { + builder = builder.header(CONTENT_LENGTH, HeaderValue::from_static("0")); + } + + let response = builder + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: path.as_ref(), + })? + .error_for_status() + .context(PutRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } + + /// Make an Azure GET request + /// + /// + pub async fn get_request( + &self, + path: &Path, + range: Option>, + head: bool, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + + let mut builder = self + .client + .request(method, url) + .header(CONTENT_LENGTH, HeaderValue::from_static("0")) + .body(Bytes::new()); + + if let Some(range) = range { + builder = builder.header(RANGE, format_http_range(range)); + } + + let response = builder + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })? 
+ .error_for_status() + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } + + /// Make an Azure Delete request + pub async fn delete_request( + &self, + path: &Path, + query: &T, + ) -> Result<()> { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + + self.client + .request(Method::DELETE, url) + .query(query) + .header(&DELETE_SNAPSHOTS, "include") + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(DeleteRequestSnafu { + path: path.as_ref(), + })? + .error_for_status() + .context(DeleteRequestSnafu { + path: path.as_ref(), + })?; + + Ok(()) + } + + /// Make an Azure Copy request + pub async fn copy_request( + &self, + from: &Path, + to: &Path, + overwrite: bool, + ) -> Result<()> { + let credential = self.get_credential().await?; + let url = self.config.path_url(to); + let mut source = self.config.path_url(from); + + // If using SAS authorization must include the headers in the URL + // + if let AzureCredential::SASToken(pairs) = &credential { + source.query_pairs_mut().extend_pairs(pairs); + } + + let mut builder = self + .client + .request(Method::PUT, url) + .header(©_SOURCE, source.to_string()) + .header(CONTENT_LENGTH, HeaderValue::from_static("0")); + + if !overwrite { + builder = builder.header(IF_NONE_MATCH, "*"); + } + + builder + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(CopyRequestSnafu { + path: from.as_ref(), + })? + .error_for_status() + .context(CopyRequestSnafu { + path: from.as_ref(), + })?; + + Ok(()) + } + + /// Make an Azure List request + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + token: Option<&str>, + ) -> Result<(ListResult, Option)> { + let credential = self.get_credential().await?; + let url = self.config.path_url(&Path::default()); + + let mut query = Vec::with_capacity(5); + query.push(("restype", "container")); + query.push(("comp", "list")); + + if let Some(prefix) = prefix { + query.push(("prefix", prefix)) + } + + if delimiter { + query.push(("delimiter", DELIMITER)) + } + + if let Some(token) = token { + query.push(("marker", token)) + } + + let response = self + .client + .request(Method::GET, url) + .query(&query) + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(ListRequestSnafu)? + .error_for_status() + .context(ListRequestSnafu)? 
+ .bytes() + .await + .context(ListRequestSnafu)?; + + let mut response: ListResultInternal = + quick_xml::de::from_reader(response.reader()) + .context(InvalidListResponseSnafu)?; + let token = response.next_marker.take(); + + Ok((response.try_into()?, token)) + } + + /// Perform a list operation automatically handling pagination + pub fn list_paginated( + &self, + prefix: Option<&Path>, + delimiter: bool, + ) -> BoxStream<'_, Result> { + let prefix = format_prefix(prefix); + stream_paginated(prefix, move |prefix, token| async move { + let (r, next_token) = self + .list_request(prefix.as_deref(), delimiter, token.as_deref()) + .await?; + Ok((r, prefix, next_token)) + }) + .boxed() + } +} + +/// Raw / internal response from list requests +#[derive(Debug, Clone, PartialEq, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct ListResultInternal { + pub prefix: Option, + pub max_results: Option, + pub delimiter: Option, + pub next_marker: Option, + pub blobs: Blobs, +} + +impl TryFrom for ListResult { + type Error = crate::Error; + + fn try_from(value: ListResultInternal) -> Result { + let common_prefixes = value + .blobs + .blob_prefix + .unwrap_or_default() + .into_iter() + .map(|x| Ok(Path::parse(&x.name)?)) + .collect::>()?; + + let objects = value + .blobs + .blobs + .into_iter() + .map(ObjectMeta::try_from) + // Note: workaround for gen2 accounts with hierarchical namespaces. These accounts also + // return path segments as "directories". When we cant directories, its always via + // the BlobPrefix mechanics. + .filter_map_ok(|obj| if obj.size > 0 { Some(obj) } else { None }) + .collect::>()?; + + Ok(Self { + common_prefixes, + objects, + }) + } +} + +/// Collection of blobs and potentially shared prefixes returned from list requests. +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct Blobs { + pub blob_prefix: Option>, + #[serde(rename = "Blob", default)] + pub blobs: Vec, +} + +/// Common prefix in list blobs response +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct BlobPrefix { + pub name: String, +} + +/// Details for a specific blob +#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct Blob { + pub name: String, + pub version_id: Option, + pub is_current_version: Option, + pub deleted: Option, + pub properties: BlobProperties, + pub metadata: Option>, +} + +impl TryFrom for ObjectMeta { + type Error = crate::Error; + + fn try_from(value: Blob) -> Result { + Ok(Self { + location: Path::parse(value.name)?, + last_modified: value.properties.last_modified, + size: value.properties.content_length as usize, + }) + } +} + +/// Properties associated with individual blobs. The actual list +/// of returned properties is much more exhaustive, but we limit +/// the parsed fields to the ones relevant in this crate. 
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct BlobProperties { + #[serde(deserialize_with = "deserialize_http_date", rename = "Last-Modified")] + pub last_modified: DateTime, + pub etag: String, + #[serde(rename = "Content-Length")] + pub content_length: u64, + #[serde(rename = "Content-Type")] + pub content_type: String, + #[serde(rename = "Content-Encoding")] + pub content_encoding: Option, + #[serde(rename = "Content-Language")] + pub content_language: Option, +} + +// deserialize dates used in Azure payloads according to rfc1123 +fn deserialize_http_date<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + Utc.datetime_from_str(&s, RFC1123_FMT) + .map_err(serde::de::Error::custom) +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct BlockId(Bytes); + +impl BlockId { + pub fn new(block_id: impl Into) -> Self { + Self(block_id.into()) + } +} + +impl From for BlockId +where + B: Into, +{ + fn from(v: B) -> Self { + Self::new(v) + } +} + +impl AsRef<[u8]> for BlockId { + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +#[derive(Default, Debug, Clone, PartialEq, Eq)] +pub(crate) struct BlockList { + pub blocks: Vec, +} + +impl BlockList { + pub fn to_xml(&self) -> String { + let mut s = String::new(); + s.push_str("\n\n"); + for block_id in &self.blocks { + let node = format!( + "\t{}\n", + base64::encode(block_id) + ); + s.push_str(&node); + } + + s.push_str(""); + s + } +} + +#[cfg(test)] +mod tests { + use bytes::Bytes; + + use super::*; + + #[test] + fn deserde_azure() { + const S: &str = " + + + + blob0.txt + + Thu, 01 Jul 2021 10:44:59 GMT + Thu, 01 Jul 2021 10:44:59 GMT + Thu, 07 Jul 2022 14:38:48 GMT + 0x8D93C7D4629C227 + 8 + text/plain + + + + rvr3UC1SmUw7AZV2NqPN0g== + + + BlockBlob + Hot + true + unlocked + available + true + + uservalue + + + + blob1.txt + + Thu, 01 Jul 2021 10:44:59 GMT + Thu, 01 Jul 2021 10:44:59 GMT + 0x8D93C7D463004D6 + 8 + text/plain + + + + rvr3UC1SmUw7AZV2NqPN0g== + + + BlockBlob + Hot + true + unlocked + available + true + + + + + blob2.txt + + Thu, 01 Jul 2021 10:44:59 GMT + Thu, 01 Jul 2021 10:44:59 GMT + 0x8D93C7D4636478A + 8 + text/plain + + + + rvr3UC1SmUw7AZV2NqPN0g== + + + BlockBlob + Hot + true + unlocked + available + true + + + + + +"; + + let bytes = Bytes::from(S); + let mut _list_blobs_response_internal: ListResultInternal = + quick_xml::de::from_slice(bytes.as_ref()).unwrap(); + } + + #[test] + fn deserde_azurite() { + const S: &str = " + + + + 5000 + + + + blob0.txt + + Thu, 01 Jul 2021 10:45:02 GMT + Thu, 01 Jul 2021 10:45:02 GMT + 0x228281B5D517B20 + 8 + text/plain + rvr3UC1SmUw7AZV2NqPN0g== + BlockBlob + unlocked + available + true + Hot + true + Thu, 01 Jul 2021 10:45:02 GMT + + + + blob1.txt + + Thu, 01 Jul 2021 10:45:02 GMT + Thu, 01 Jul 2021 10:45:02 GMT + 0x1DD959381A8A860 + 8 + text/plain + rvr3UC1SmUw7AZV2NqPN0g== + BlockBlob + unlocked + available + true + Hot + true + Thu, 01 Jul 2021 10:45:02 GMT + + + + blob2.txt + + Thu, 01 Jul 2021 10:45:02 GMT + Thu, 01 Jul 2021 10:45:02 GMT + 0x1FBE9C9B0C7B650 + 8 + text/plain + rvr3UC1SmUw7AZV2NqPN0g== + BlockBlob + unlocked + available + true + Hot + true + Thu, 01 Jul 2021 10:45:02 GMT + + + + +"; + + let bytes = Bytes::from(S); + let mut _list_blobs_response_internal: ListResultInternal = + quick_xml::de::from_slice(bytes.as_ref()).unwrap(); + } + + #[test] + fn to_xml() { + const S: &str = " + +\tbnVtZXJvMQ== +\tbnVtZXJvMg== +\tbnVtZXJvMw== 
+"; + let mut blocks = BlockList { blocks: Vec::new() }; + blocks.blocks.push(Bytes::from_static(b"numero1").into()); + blocks.blocks.push("numero2".into()); + blocks.blocks.push("numero3".into()); + + let res: &str = &blocks.to_xml(); + + assert_eq!(res, S) + } +} diff --git a/src/azure/credential.rs b/src/azure/credential.rs new file mode 100644 index 0000000..9357e80 --- /dev/null +++ b/src/azure/credential.rs @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::oauth::ClientSecretOAuthProvider; +use crate::util::hmac_sha256; +use chrono::Utc; +use reqwest::{ + header::{ + HeaderMap, HeaderName, HeaderValue, AUTHORIZATION, CONTENT_ENCODING, + CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, + IF_MODIFIED_SINCE, IF_NONE_MATCH, IF_UNMODIFIED_SINCE, RANGE, + }, + Method, RequestBuilder, +}; +use std::borrow::Cow; +use std::str; +use url::Url; + +static AZURE_VERSION: HeaderValue = HeaderValue::from_static("2021-08-06"); +static VERSION: HeaderName = HeaderName::from_static("x-ms-version"); +pub(crate) static BLOB_TYPE: HeaderName = HeaderName::from_static("x-ms-blob-type"); +pub(crate) static DELETE_SNAPSHOTS: HeaderName = + HeaderName::from_static("x-ms-delete-snapshots"); +pub(crate) static COPY_SOURCE: HeaderName = HeaderName::from_static("x-ms-copy-source"); +static CONTENT_MD5: HeaderName = HeaderName::from_static("content-md5"); +pub(crate) static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; + +/// Provides credentials for use when signing requests +#[derive(Debug)] +pub enum CredentialProvider { + AccessKey(String), + SASToken(Vec<(String, String)>), + ClientSecret(ClientSecretOAuthProvider), +} + +pub(crate) enum AzureCredential { + AccessKey(String), + SASToken(Vec<(String, String)>), + AuthorizationToken(HeaderValue), +} + +/// A list of known Azure authority hosts +pub mod authority_hosts { + /// China-based Azure Authority Host + pub const AZURE_CHINA: &str = "https://login.chinacloudapi.cn"; + /// Germany-based Azure Authority Host + pub const AZURE_GERMANY: &str = "https://login.microsoftonline.de"; + /// US Government Azure Authority Host + pub const AZURE_GOVERNMENT: &str = "https://login.microsoftonline.us"; + /// Public Cloud Azure Authority Host + pub const AZURE_PUBLIC_CLOUD: &str = "https://login.microsoftonline.com"; +} + +pub(crate) trait CredentialExt { + /// Apply authorization to requests against azure storage accounts + /// + fn with_azure_authorization( + self, + credential: &AzureCredential, + account: &str, + ) -> Self; +} + +impl CredentialExt for RequestBuilder { + fn with_azure_authorization( + mut self, + credential: &AzureCredential, + account: &str, + ) -> Self { + // rfc2822 string should never contain illegal characters + let date = Utc::now(); + let 
date_str = date.format(RFC1123_FMT).to_string(); + // we formatted the data string ourselves, so unwrapping should be fine + let date_val = HeaderValue::from_str(&date_str).unwrap(); + self = self + .header(DATE, &date_val) + .header(&VERSION, &AZURE_VERSION); + + // Hack around lack of access to underlying request + // https://github.com/seanmonstar/reqwest/issues/1212 + let request = self + .try_clone() + .expect("not stream") + .build() + .expect("request valid"); + + match credential { + AzureCredential::AccessKey(key) => { + let signature = generate_authorization( + request.headers(), + request.url(), + request.method(), + account, + key.as_str(), + ); + self = self + // "signature" is a base 64 encoded string so it should never contain illegal characters. + .header( + AUTHORIZATION, + HeaderValue::from_str(signature.as_str()).unwrap(), + ); + } + AzureCredential::AuthorizationToken(token) => { + self = self.header(AUTHORIZATION, token); + } + AzureCredential::SASToken(query_pairs) => { + self = self.query(&query_pairs); + } + }; + + self + } +} + +/// Generate signed key for authorization via access keys +/// +fn generate_authorization( + h: &HeaderMap, + u: &Url, + method: &Method, + account: &str, + key: &str, +) -> String { + let str_to_sign = string_to_sign(h, u, method, account); + let auth = hmac_sha256(base64::decode(key).unwrap(), &str_to_sign); + format!("SharedKey {}:{}", account, base64::encode(auth)) +} + +fn add_if_exists<'a>(h: &'a HeaderMap, key: &HeaderName) -> &'a str { + h.get(key) + .map(|s| s.to_str()) + .transpose() + .ok() + .flatten() + .unwrap_or_default() +} + +/// +fn string_to_sign(h: &HeaderMap, u: &Url, method: &Method, account: &str) -> String { + // content length must only be specified if != 0 + // this is valid from 2015-02-21 + let content_length = h + .get(&CONTENT_LENGTH) + .map(|s| s.to_str()) + .transpose() + .ok() + .flatten() + .filter(|&v| v != "0") + .unwrap_or_default(); + format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}{}", + method.as_ref(), + add_if_exists(h, &CONTENT_ENCODING), + add_if_exists(h, &CONTENT_LANGUAGE), + content_length, + add_if_exists(h, &CONTENT_MD5), + add_if_exists(h, &CONTENT_TYPE), + add_if_exists(h, &DATE), + add_if_exists(h, &IF_MODIFIED_SINCE), + add_if_exists(h, &IF_MATCH), + add_if_exists(h, &IF_NONE_MATCH), + add_if_exists(h, &IF_UNMODIFIED_SINCE), + add_if_exists(h, &RANGE), + canonicalize_header(h), + canonicalized_resource(account, u) + ) +} + +/// +fn canonicalize_header(headers: &HeaderMap) -> String { + let mut names = headers + .iter() + .filter_map(|(k, _)| { + (k.as_str().starts_with("x-ms")) + // TODO remove unwraps + .then(|| (k.as_str(), headers.get(k).unwrap().to_str().unwrap())) + }) + .collect::>(); + names.sort_unstable(); + + let mut result = String::new(); + for (name, value) in names { + result.push_str(name); + result.push(':'); + result.push_str(value); + result.push('\n'); + } + result +} + +/// +fn canonicalized_resource(account: &str, uri: &Url) -> String { + let mut can_res: String = String::new(); + can_res.push('/'); + can_res.push_str(account); + can_res.push_str(uri.path().to_string().as_str()); + can_res.push('\n'); + + // query parameters + let query_pairs = uri.query_pairs(); + { + let mut qps: Vec = Vec::new(); + for (q, _) in query_pairs { + if !(qps.iter().any(|x| x == &*q)) { + qps.push(q.into_owned()); + } + } + + qps.sort(); + + for qparam in qps { + // find correct parameter + let ret = lexy_sort(query_pairs, &qparam); + + can_res = can_res + 
&qparam.to_lowercase() + ":"; + + for (i, item) in ret.iter().enumerate() { + if i > 0 { + can_res.push(','); + } + can_res.push_str(item); + } + + can_res.push('\n'); + } + }; + + can_res[0..can_res.len() - 1].to_owned() +} + +fn lexy_sort<'a>( + vec: impl Iterator, Cow<'a, str>)> + 'a, + query_param: &str, +) -> Vec> { + let mut values = vec + .filter(|(k, _)| *k == query_param) + .map(|(_, v)| v) + .collect::>(); + values.sort_unstable(); + values +} diff --git a/src/azure/mod.rs b/src/azure/mod.rs new file mode 100644 index 0000000..53e7ed6 --- /dev/null +++ b/src/azure/mod.rs @@ -0,0 +1,705 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store implementation for Azure blob storage +//! +//! ## Streaming uploads +//! +//! [ObjectStore::put_multipart] will upload data in blocks and write a blob from those +//! blocks. Data is buffered internally to make blocks of at least 5MB and blocks +//! are uploaded concurrently. +//! +//! [ObjectStore::abort_multipart] is a no-op, since Azure Blob Store doesn't provide +//! a way to drop old blocks. Instead unused blocks are automatically cleaned up +//! after 7 days. +use self::client::{BlockId, BlockList}; +use crate::{ + multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, + path::Path, + GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, +}; +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{TimeZone, Utc}; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use snafu::{ResultExt, Snafu}; +use std::collections::BTreeSet; +use std::fmt::{Debug, Formatter}; +use std::io; +use std::ops::Range; +use std::sync::Arc; +use tokio::io::AsyncWrite; +use url::Url; + +pub use credential::authority_hosts; + +mod client; +mod credential; + +/// The well-known account used by Azurite and the legacy Azure Storage Emulator. +/// +const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; + +/// The well-known account key used by Azurite and the legacy Azure Storage Emulator. 
+/// +const EMULATOR_ACCOUNT_KEY: &str = + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="; + +/// A specialized `Error` for Azure object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Last-Modified Header missing from response"))] + MissingLastModified, + + #[snafu(display("Content-Length Header missing from response"))] + MissingContentLength, + + #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] + InvalidLastModified { + last_modified: String, + source: chrono::ParseError, + }, + + #[snafu(display("Invalid content length '{}': {}", content_length, source))] + InvalidContentLength { + content_length: String, + source: std::num::ParseIntError, + }, + + #[snafu(display("Received header containing non-ASCII data"))] + BadHeader { source: reqwest::header::ToStrError }, + + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unable parse emulator url {}={}, Error: {}", + env_name, + env_value, + source + ))] + UnableToParseEmulatorUrl { + env_name: String, + env_value: String, + source: url::ParseError, + }, + + #[snafu(display("Account must be specified"))] + MissingAccount {}, + + #[snafu(display("Container name must be specified"))] + MissingContainerName {}, + + #[snafu(display("At least one authorization option must be specified"))] + MissingCredentials {}, +} + +impl From for super::Error { + fn from(source: Error) -> Self { + Self::Generic { + store: "MicrosoftAzure", + source: Box::new(source), + } + } +} + +/// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). +#[derive(Debug)] +pub struct MicrosoftAzure { + client: Arc, +} + +impl std::fmt::Display for MicrosoftAzure { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "MicrosoftAzure {{ account: {}, container: {} }}", + self.client.config().account, + self.client.config().container + ) + } +} + +#[async_trait] +impl ObjectStore for MicrosoftAzure { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.client + .put_request(location, Some(bytes), false, &()) + .await?; + Ok(()) + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let inner = AzureMultiPartUpload { + client: Arc::clone(&self.client), + location: location.to_owned(), + }; + Ok((String::new(), Box::new(CloudMultiPartUpload::new(inner, 8)))) + } + + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> Result<()> { + // There is no way to drop blocks that have been uploaded. Instead, they simply + // expire in 7 days. + Ok(()) + } + + async fn get(&self, location: &Path) -> Result { + let response = self.client.get_request(location, None, false).await?; + let stream = response + .bytes_stream() + .map_err(|source| crate::Error::Generic { + store: "MicrosoftAzure", + source: Box::new(source), + }) + .boxed(); + + Ok(GetResult::Stream(stream)) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let bytes = self + .client + .get_request(location, Some(range), false) + .await? 
+ .bytes() + .await + .map_err(|source| client::Error::GetRequest { + source, + path: location.to_string(), + })?; + Ok(bytes) + } + + async fn head(&self, location: &Path) -> Result { + use reqwest::header::{CONTENT_LENGTH, LAST_MODIFIED}; + + // Extract meta from headers + // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties + let response = self.client.get_request(location, None, true).await?; + let headers = response.headers(); + + let last_modified = headers + .get(LAST_MODIFIED) + .ok_or(Error::MissingLastModified)? + .to_str() + .context(BadHeaderSnafu)?; + let last_modified = Utc + .datetime_from_str(last_modified, credential::RFC1123_FMT) + .context(InvalidLastModifiedSnafu { last_modified })?; + + let content_length = headers + .get(CONTENT_LENGTH) + .ok_or(Error::MissingContentLength)? + .to_str() + .context(BadHeaderSnafu)?; + let content_length = content_length + .parse() + .context(InvalidContentLengthSnafu { content_length })?; + + Ok(ObjectMeta { + location: location.clone(), + last_modified, + size: content_length, + }) + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.client.delete_request(location, &()).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let stream = self + .client + .list_paginated(prefix, false) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let mut stream = self.client.list_paginated(prefix, true); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + while let Some(result) = stream.next().await { + let response = result?; + common_prefixes.extend(response.common_prefixes.into_iter()); + objects.extend(response.objects.into_iter()); + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy_request(from, to, true).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy_request(from, to, false).await + } +} + +/// Relevant docs: +/// In Azure Blob Store, parts are "blocks" +/// put_multipart_part -> PUT block +/// complete -> PUT block list +/// abort -> No equivalent; blocks are simply dropped after 7 days +#[derive(Debug, Clone)] +struct AzureMultiPartUpload { + client: Arc, + location: Path, +} + +#[async_trait] +impl CloudMultiPartUploadImpl for AzureMultiPartUpload { + async fn put_multipart_part( + &self, + buf: Vec, + part_idx: usize, + ) -> Result { + let content_id = format!("{:20}", part_idx); + let block_id: BlockId = content_id.clone().into(); + + self.client + .put_request( + &self.location, + Some(buf.into()), + true, + &[("comp", "block"), ("blockid", &base64::encode(block_id))], + ) + .await?; + + Ok(UploadPart { content_id }) + } + + async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error> { + let blocks = completed_parts + .into_iter() + .map(|part| BlockId::from(part.content_id)) + .collect(); + + let block_list = BlockList { blocks }; + let block_xml = block_list.to_xml(); + + self.client + .put_request( + &self.location, + Some(block_xml.into()), + true, + &[("comp", "blocklist")], + ) + .await?; + + Ok(()) + } +} + +/// Configure a connection to Microsoft Azure Blob Storage container using +/// the specified credentials. 
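// A note on AzureMultiPartUpload::put_multipart_part above (editor sketch, not
// part of this patch): Azure requires every block id within a blob to have the
// same length, which is why the part index is rendered at a fixed width before
// being base64 encoded as the `blockid` query parameter, roughly:
//
//     let content_id = format!("{:20}", part_idx); // fixed width, space padded
//     let block_id = base64::encode(content_id);   // value sent as `blockid`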
+/// +/// # Example +/// ``` +/// # let ACCOUNT = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY = "foo"; +/// # use object_store::azure::MicrosoftAzureBuilder; +/// let azure = MicrosoftAzureBuilder::new() +/// .with_account(ACCOUNT) +/// .with_access_key(ACCESS_KEY) +/// .with_container_name(BUCKET_NAME) +/// .build(); +/// ``` +#[derive(Default)] +pub struct MicrosoftAzureBuilder { + account_name: Option, + access_key: Option, + container_name: Option, + bearer_token: Option, + client_id: Option, + client_secret: Option, + tenant_id: Option, + sas_query_pairs: Option>, + authority_host: Option, + use_emulator: bool, + retry_config: RetryConfig, + allow_http: bool, +} + +impl Debug for MicrosoftAzureBuilder { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "MicrosoftAzureBuilder {{ account: {:?}, container_name: {:?} }}", + self.account_name, self.container_name + ) + } +} + +impl MicrosoftAzureBuilder { + /// Create a new [`MicrosoftAzureBuilder`] with default values. + pub fn new() -> Self { + Default::default() + } + + /// Create an instance of [MicrosoftAzureBuilder] with values pre-populated from environment variables. + /// + /// Variables extracted from environment: + /// * AZURE_STORAGE_ACCOUNT_NAME: storage account name + /// * AZURE_STORAGE_ACCOUNT_KEY: storage account master key + /// * AZURE_STORAGE_ACCESS_KEY: alias for AZURE_STORAGE_ACCOUNT_KEY + /// * AZURE_STORAGE_CLIENT_ID -> client id for service principal authorization + /// * AZURE_STORAGE_CLIENT_SECRET -> client secret for service principal authorization + /// * AZURE_STORAGE_TENANT_ID -> tenant id used in oauth flows + /// # Example + /// ``` + /// use object_store::azure::MicrosoftAzureBuilder; + /// + /// let azure = MicrosoftAzureBuilder::from_env() + /// .with_container_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder = Self::default(); + + if let Ok(account_name) = std::env::var("AZURE_STORAGE_ACCOUNT_NAME") { + builder.account_name = Some(account_name); + } + + if let Ok(access_key) = std::env::var("AZURE_STORAGE_ACCOUNT_KEY") { + builder.access_key = Some(access_key); + } else if let Ok(access_key) = std::env::var("AZURE_STORAGE_ACCESS_KEY") { + builder.access_key = Some(access_key); + } + + if let Ok(client_id) = std::env::var("AZURE_STORAGE_CLIENT_ID") { + builder.client_id = Some(client_id); + } + + if let Ok(client_secret) = std::env::var("AZURE_STORAGE_CLIENT_SECRET") { + builder.client_secret = Some(client_secret); + } + + if let Ok(tenant_id) = std::env::var("AZURE_STORAGE_TENANT_ID") { + builder.tenant_id = Some(tenant_id); + } + + builder + } + + /// Set the Azure Account (required) + pub fn with_account(mut self, account: impl Into) -> Self { + self.account_name = Some(account.into()); + self + } + + /// Set the Azure Container Name (required) + pub fn with_container_name(mut self, container_name: impl Into) -> Self { + self.container_name = Some(container_name.into()); + self + } + + /// Set the Azure Access Key (required - one of access key, bearer token, or client credentials) + pub fn with_access_key(mut self, access_key: impl Into) -> Self { + self.access_key = Some(access_key.into()); + self + } + + /// Set a static bearer token to be used for authorizing requests + pub fn with_bearer_token_authorization( + mut self, + bearer_token: impl Into, + ) -> Self { + self.bearer_token = Some(bearer_token.into()); + self + } + + /// Set a client secret used for client secret authorization + pub fn 
with_client_secret_authorization( + mut self, + client_id: impl Into, + client_secret: impl Into, + tenant_id: impl Into, + ) -> Self { + self.client_id = Some(client_id.into()); + self.client_secret = Some(client_secret.into()); + self.tenant_id = Some(tenant_id.into()); + self + } + + /// Set query pairs appended to the url for shared access signature authorization + pub fn with_sas_authorization( + mut self, + query_pairs: impl Into>, + ) -> Self { + self.sas_query_pairs = Some(query_pairs.into()); + self + } + + /// Set if the Azure emulator should be used (defaults to false) + pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { + self.use_emulator = use_emulator; + self + } + + /// Sets what protocol is allowed. If `allow_http` is : + /// * false (default): Only HTTPS is allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.allow_http = allow_http; + self + } + + /// Sets an alternative authority host for OAuth based authorization + /// common hosts for azure clouds are defined in [authority_hosts]. + /// Defaults to + pub fn with_authority_host(mut self, authority_host: String) -> Self { + self.authority_host = Some(authority_host); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// Configure a connection to container with given name on Microsoft Azure + /// Blob store. + pub fn build(self) -> Result { + let Self { + account_name, + access_key, + container_name, + bearer_token, + client_id, + client_secret, + tenant_id, + sas_query_pairs, + use_emulator, + retry_config, + allow_http, + authority_host, + } = self; + + let container = container_name.ok_or(Error::MissingContainerName {})?; + + let (is_emulator, allow_http, storage_url, auth, account) = if use_emulator { + let account_name = + account_name.unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); + // Allow overriding defaults. 
Values taken from + // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 + let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; + let account_key = + access_key.unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); + let credential = credential::CredentialProvider::AccessKey(account_key); + (true, true, url, credential, account_name) + } else { + let account_name = account_name.ok_or(Error::MissingAccount {})?; + let account_url = format!("https://{}.blob.core.windows.net", &account_name); + let url = Url::parse(&account_url) + .context(UnableToParseUrlSnafu { url: account_url })?; + let credential = if let Some(bearer_token) = bearer_token { + Ok(credential::CredentialProvider::AccessKey(bearer_token)) + } else if let Some(access_key) = access_key { + Ok(credential::CredentialProvider::AccessKey(access_key)) + } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = + (client_id, client_secret, tenant_id) + { + let client_credential = + crate::client::oauth::ClientSecretOAuthProvider::new_azure( + client_id, + client_secret, + tenant_id, + authority_host, + ); + Ok(credential::CredentialProvider::ClientSecret( + client_credential, + )) + } else if let Some(query_pairs) = sas_query_pairs { + Ok(credential::CredentialProvider::SASToken(query_pairs)) + } else { + Err(Error::MissingCredentials {}) + }?; + (false, allow_http, url, credential, account_name) + }; + + let config = client::AzureConfig { + account, + allow_http, + retry_config, + service: storage_url, + container, + credentials: auth, + is_emulator, + }; + + let client = Arc::new(client::AzureClient::new(config)); + + Ok(MicrosoftAzure { client }) + } +} + +/// Parses the contents of the environment variable `env_name` as a URL +/// if present, otherwise falls back to default_url +fn url_from_env(env_name: &str, default_url: &str) -> Result { + let url = match std::env::var(env_name) { + Ok(env_value) => { + Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { + env_name, + env_value, + })? + } + Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), + }; + Ok(url) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::{ + copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, stream_get, + }; + use std::env; + + // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment + // variables are not set. + macro_rules! 
maybe_skip_integration { + () => {{ + dotenv::dotenv().ok(); + + let use_emulator = std::env::var("AZURE_USE_EMULATOR").is_ok(); + + let mut required_vars = vec!["OBJECT_STORE_BUCKET"]; + if !use_emulator { + required_vars.push("AZURE_STORAGE_ACCOUNT"); + required_vars.push("AZURE_STORAGE_ACCESS_KEY"); + } + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = std::env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ) + } else if force.is_err() { + eprintln!( + "skipping Azure integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + return; + } else { + let builder = MicrosoftAzureBuilder::new() + .with_container_name( + env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET"), + ) + .with_use_emulator(use_emulator); + if !use_emulator { + builder + .with_account( + env::var("AZURE_STORAGE_ACCOUNT").unwrap_or_default(), + ) + .with_access_key( + env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), + ) + } else { + builder + } + } + }}; + } + + #[tokio::test] + async fn azure_blob_test() { + let integration = maybe_skip_integration!().build().unwrap(); + + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + + // test for running integration test against actual blob service with service principal + // credentials. To run make sure all environment variables are set and remove the ignore + #[tokio::test] + #[ignore] + async fn azure_blob_test_sp() { + dotenv::dotenv().ok(); + let builder = MicrosoftAzureBuilder::new() + .with_account( + env::var("AZURE_STORAGE_ACCOUNT") + .expect("must be set AZURE_STORAGE_ACCOUNT"), + ) + .with_container_name( + env::var("OBJECT_STORE_BUCKET").expect("must be set OBJECT_STORE_BUCKET"), + ) + .with_client_secret_authorization( + env::var("AZURE_STORAGE_CLIENT_ID") + .expect("must be set AZURE_STORAGE_CLIENT_ID"), + env::var("AZURE_STORAGE_CLIENT_SECRET") + .expect("must be set AZURE_STORAGE_CLIENT_SECRET"), + env::var("AZURE_STORAGE_TENANT_ID") + .expect("must be set AZURE_STORAGE_TENANT_ID"), + ); + let integration = builder.build().unwrap(); + + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } +} diff --git a/src/client/mod.rs b/src/client/mod.rs index 7241002..10e8d91 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -18,7 +18,7 @@ //! Generic utilities reqwest based ObjectStore implementations pub mod backoff; -#[cfg(feature = "gcp")] +#[cfg(any(feature = "gcp", feature = "azure"))] pub mod oauth; pub mod pagination; pub mod retry; diff --git a/src/client/oauth.rs b/src/client/oauth.rs index 88e7a7b..2209406 100644 --- a/src/client/oauth.rs +++ b/src/client/oauth.rs @@ -16,13 +16,17 @@ // under the License. 
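// The hunk below adds a ClientSecretOAuthProvider that performs an OAuth2
// "client_credentials" exchange against the Azure AD v2.0 token endpoint. A
// rough standalone sketch of that request (tenant/client values are supplied by
// the caller here; the real implementation also routes through the retry layer
// and caches the resulting token):
async fn token_request_sketch(
    client: &reqwest::Client,
    tenant_id: &str,
    client_id: &str,
    client_secret: &str,
) -> Result<reqwest::Response, reqwest::Error> {
    // Default public-cloud authority host; an alternative host can be configured.
    let token_url = format!(
        "https://login.microsoftonline.com/{}/oauth2/v2.0/token",
        tenant_id
    );
    client
        .post(token_url.as_str())
        .form(&[
            ("client_id", client_id),
            ("client_secret", client_secret),
            ("scope", "https://storage.azure.com/.default"),
            ("grant_type", "client_credentials"),
        ])
        .send()
        .await?
        .error_for_status()
}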
use crate::client::retry::RetryExt; -use crate::client::token::TemporaryToken; +use crate::client::token::{TemporaryToken, TokenCache}; use crate::RetryConfig; +use reqwest::header::{HeaderMap, HeaderValue, ACCEPT}; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; use std::time::{Duration, Instant}; +const CONTENT_TYPE_JSON: &str = "application/json"; +const AZURE_STORAGE_TOKEN_SCOPE: &str = "https://storage.azure.com/.default"; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("No RSA key found in pem file"))] @@ -219,3 +223,82 @@ fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; Ok(base64::encode_config(string, base64::URL_SAFE_NO_PAD)) } + +/// Encapsulates the logic to perform an OAuth token challenge +#[derive(Debug)] +pub struct ClientSecretOAuthProvider { + scope: String, + token_url: String, + client_id: String, + client_secret: String, + cache: TokenCache, +} + +impl ClientSecretOAuthProvider { + /// Create a new [`ClientSecretOAuthProvider`] for an azure backed store + pub fn new_azure( + client_id: String, + client_secret: String, + tenant_id: String, + authority_host: Option, + ) -> Self { + let authority_host = authority_host.unwrap_or_else(|| { + crate::azure::authority_hosts::AZURE_PUBLIC_CLOUD.to_owned() + }); + + Self { + scope: AZURE_STORAGE_TOKEN_SCOPE.to_owned(), + token_url: format!("{}/{}/oauth2/v2.0/token", authority_host, tenant_id), + client_id, + client_secret, + cache: TokenCache::default(), + } + } + + /// Fetch a token + pub async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result { + self.cache + .get_or_insert_with(|| self.fetch_token_inner(client, retry)) + .await + } + + /// Fetch a fresh token + async fn fetch_token_inner( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result> { + let mut headers = HeaderMap::new(); + headers.append(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)); + + let mut params = std::collections::HashMap::new(); + params.insert("client_id", self.client_id.as_str()); + params.insert("client_secret", self.client_secret.as_str()); + params.insert("scope", self.scope.as_str()); + params.insert("grant_type", "client_credentials"); + + let response: TokenResponse = client + .request(Method::POST, &self.token_url) + .headers(headers) + .form(¶ms) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .error_for_status() + .context(TokenRequestSnafu)? 
+ .json() + .await + .context(TokenRequestSnafu)?; + + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + + Ok(token) + } +} diff --git a/src/client/pagination.rs b/src/client/pagination.rs index 3ab17fe..1febe3a 100644 --- a/src/client/pagination.rs +++ b/src/client/pagination.rs @@ -49,8 +49,10 @@ where futures::stream::unfold(PaginationState::Start(state), move |state| async move { let (s, page_token) = match state { PaginationState::Start(s) => (s, None), - PaginationState::HasMore(s, page_token) => (s, Some(page_token)), - PaginationState::Done => { + PaginationState::HasMore(s, page_token) if !page_token.is_empty() => { + (s, Some(page_token)) + } + _ => { return None; } }; diff --git a/src/lib.rs b/src/lib.rs index 374f559..5811eba 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -165,10 +165,10 @@ pub mod memory; pub mod path; pub mod throttle; -#[cfg(any(feature = "gcp", feature = "aws"))] +#[cfg(any(feature = "gcp", feature = "aws", feature = "azure"))] mod client; -#[cfg(any(feature = "gcp", feature = "aws"))] +#[cfg(any(feature = "gcp", feature = "aws", feature = "azure"))] pub use client::{backoff::BackoffConfig, retry::RetryConfig}; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] @@ -506,8 +506,6 @@ mod tests { use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { - let store_str = storage.to_string(); - delete_fixtures(storage).await; let content_list = flatten_list_stream(storage, None).await.unwrap(); @@ -565,26 +563,16 @@ mod tests { let out_of_range = 200..300; let out_of_range_result = storage.get_range(&location, out_of_range).await; - if store_str.starts_with("MicrosoftAzureEmulator") { - // Azurite doesn't support x-ms-range-get-content-crc64 set by Azure SDK - // https://github.com/Azure/Azurite/issues/444 - let err = range_result.unwrap_err().to_string(); - assert!(err.contains("x-ms-range-get-content-crc64 header or parameter is not supported in Azurite strict mode"), "{}", err); - - let err = out_of_range_result.unwrap_err().to_string(); - assert!(err.contains("x-ms-range-get-content-crc64 header or parameter is not supported in Azurite strict mode"), "{}", err); - } else { - let bytes = range_result.unwrap(); - assert_eq!(bytes, expected_data.slice(range)); - - // Should be a non-fatal error - out_of_range_result.unwrap_err(); - - let ranges = vec![0..1, 2..3, 0..5]; - let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); - for (range, bytes) in ranges.iter().zip(bytes) { - assert_eq!(bytes, expected_data.slice(range.clone())) - } + let bytes = range_result.unwrap(); + assert_eq!(bytes, expected_data.slice(range)); + + // Should be a non-fatal error + out_of_range_result.unwrap_err(); + + let ranges = vec![0..1, 2..3, 0..5]; + let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); + for (range, bytes) in ranges.iter().zip(bytes) { + assert_eq!(bytes, expected_data.slice(range.clone())) } let head = storage.head(&location).await.unwrap(); @@ -725,7 +713,7 @@ mod tests { let location = Path::from("test_dir/test_upload_file.txt"); // Can write to storage - let data = get_vec_of_bytes(5_000_000, 10); + let data = get_vec_of_bytes(5_000, 10); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { diff --git a/src/util.rs b/src/util.rs index f548ed4..1c95214 100644 --- a/src/util.rs +++ b/src/util.rs @@ -30,11 +30,20 @@ pub fn 
format_prefix(prefix: Option<&crate::path::Path>) -> Option { /// Returns a formatted HTTP range header as per /// -#[cfg(any(feature = "aws", feature = "gcp"))] +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub fn format_http_range(range: std::ops::Range) -> String { format!("bytes={}-{}", range.start, range.end.saturating_sub(1)) } +#[cfg(any(feature = "aws", feature = "azure"))] +pub(crate) fn hmac_sha256( + secret: impl AsRef<[u8]>, + bytes: impl AsRef<[u8]>, +) -> ring::hmac::Tag { + let key = ring::hmac::Key::new(ring::hmac::HMAC_SHA256, secret.as_ref()); + ring::hmac::sign(&key, bytes.as_ref()) +} + /// Collect a stream into [`Bytes`] avoiding copying in the event of a single chunk pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result where From 3989bf0f216375236f6d9b2b60d3c5e5b7277697 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 24 Aug 2022 19:14:49 +0100 Subject: [PATCH 030/397] Run clippy on object store tests (#2575) --- src/aws/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 9a7a5b8..f7955ad 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -622,17 +622,17 @@ mod tests { #[test] fn s3_test_config_from_env() { let aws_access_key_id = env::var("AWS_ACCESS_KEY_ID") - .unwrap_or("object_store:fake_access_key_id".into()); + .unwrap_or_else(|_| "object_store:fake_access_key_id".into()); let aws_secret_access_key = env::var("AWS_SECRET_ACCESS_KEY") - .unwrap_or("object_store:fake_secret_key".into()); + .unwrap_or_else(|_| "object_store:fake_secret_key".into()); let aws_default_region = env::var("AWS_DEFAULT_REGION") - .unwrap_or("object_store:fake_default_region".into()); + .unwrap_or_else(|_| "object_store:fake_default_region".into()); - let aws_endpoint = - env::var("AWS_ENDPOINT").unwrap_or("object_store:fake_endpoint".into()); + let aws_endpoint = env::var("AWS_ENDPOINT") + .unwrap_or_else(|_| "object_store:fake_endpoint".into()); let aws_session_token = env::var("AWS_SESSION_TOKEN") - .unwrap_or("object_store:fake_session_token".into()); + .unwrap_or_else(|_| "object_store:fake_session_token".into()); // required env::set_var("AWS_ACCESS_KEY_ID", &aws_access_key_id); From f186ed7a4dcf080ab992499b3f612a9150eb42c6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 25 Aug 2022 12:27:45 +0100 Subject: [PATCH 031/397] Improve coalesce_ranges (#2561) (#2562) (#2563) * Improve coalesce_ranges (#2561) (#2562) * Review feedback * Tweak fuzz test --- Cargo.toml | 1 + src/util.rs | 160 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 121 insertions(+), 40 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 966c423..b5c5ef6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,3 +62,4 @@ aws = ["cloud"] dotenv = "0.15.0" tempfile = "3.1.0" futures-test = "0.3" +rand = "0.8" diff --git a/src/util.rs b/src/util.rs index 1c95214..2814ca2 100644 --- a/src/util.rs +++ b/src/util.rs @@ -18,7 +18,7 @@ //! 
Common logic for interacting with remote object stores use super::Result; use bytes::Bytes; -use futures::{stream::StreamExt, Stream}; +use futures::{stream::StreamExt, Stream, TryStreamExt}; /// Returns the prefix to be passed to an object store #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] @@ -85,21 +85,60 @@ where /// will be coalesced into a single request by [`coalesce_ranges`] pub const OBJECT_STORE_COALESCE_DEFAULT: usize = 1024 * 1024; -/// Takes a function to fetch ranges and coalesces adjacent ranges if they are -/// less than `coalesce` bytes apart. Out of order `ranges` are not coalesced +/// Up to this number of range requests will be performed in parallel by [`coalesce_ranges`] +pub const OBJECT_STORE_COALESCE_PARALLEL: usize = 10; + +/// Takes a function `fetch` that can fetch a range of bytes and uses this to +/// fetch the provided byte `ranges` +/// +/// To improve performance it will: +/// +/// * Combine ranges less than `coalesce` bytes apart into a single call to `fetch` +/// * Make multiple `fetch` requests in parallel (up to maximum of 10) +/// pub async fn coalesce_ranges( ranges: &[std::ops::Range], - mut fetch: F, + fetch: F, coalesce: usize, ) -> Result> where F: Send + FnMut(std::ops::Range) -> Fut, Fut: std::future::Future> + Send, { + let fetch_ranges = merge_ranges(ranges, coalesce); + + let fetched: Vec<_> = futures::stream::iter(fetch_ranges.iter().cloned()) + .map(fetch) + .buffered(OBJECT_STORE_COALESCE_PARALLEL) + .try_collect() + .await?; + + Ok(ranges + .iter() + .map(|range| { + let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; + let fetch_range = &fetch_ranges[idx]; + let fetch_bytes = &fetched[idx]; + + let start = range.start - fetch_range.start; + let end = range.end - fetch_range.start; + fetch_bytes.slice(start..end) + }) + .collect()) +} + +/// Returns a sorted list of ranges that cover `ranges` +fn merge_ranges( + ranges: &[std::ops::Range], + coalesce: usize, +) -> Vec> { if ranges.is_empty() { - return Ok(vec![]); + return vec![]; } + let mut ranges = ranges.to_vec(); + ranges.sort_unstable_by_key(|range| range.start); + let mut ret = Vec::with_capacity(ranges.len()); let mut start_idx = 0; let mut end_idx = 1; @@ -110,57 +149,59 @@ where while end_idx != ranges.len() && ranges[end_idx] .start - .checked_sub(ranges[start_idx].end) + .checked_sub(range_end) .map(|delta| delta <= coalesce) - .unwrap_or(false) + .unwrap_or(true) { - if ranges[end_idx].end > range_end { - range_end = ranges[end_idx].end; - } + range_end = range_end.max(ranges[end_idx].end); end_idx += 1; } let start = ranges[start_idx].start; - let bytes = fetch(start..range_end).await?; - for range in ranges.iter().take(end_idx).skip(start_idx) { - ret.push(bytes.slice(range.start - start..range.end - start)) - } + let end = range_end; + ret.push(start..end); + start_idx = end_idx; end_idx += 1; } - Ok(ret) + + ret } #[cfg(test)] mod tests { use super::*; + use rand::{thread_rng, Rng}; use std::ops::Range; + /// Calls coalesce_ranges and validates the returned data is correct + /// + /// Returns the fetched ranges + async fn do_fetch(ranges: Vec>, coalesce: usize) -> Vec> { + let max = ranges.iter().map(|x| x.end).max().unwrap_or(0); + let src: Vec<_> = (0..max).map(|x| x as u8).collect(); + + let mut fetches = vec![]; + let coalesced = coalesce_ranges( + &ranges, + |range| { + fetches.push(range.clone()); + futures::future::ready(Ok(Bytes::from(src[range].to_vec()))) + }, + coalesce, + ) + .await + .unwrap(); + + 
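        // Worked example of the coalescing exercised here (illustrative): with
        // coalesce = 1, the request ranges [0..1, 5..6, 7..9, 2..3, 4..6] are
        // sorted by start and merged into the single fetch 0..9, because every
        // gap between consecutive sorted ranges is at most 1 byte; the
        // test_coalesce_ranges case below asserts exactly that.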
assert_eq!(ranges.len(), coalesced.len()); + for (range, bytes) in ranges.iter().zip(coalesced) { + assert_eq!(bytes.as_ref(), &src[range.clone()]); + } + fetches + } + #[tokio::test] async fn test_coalesce_ranges() { - let do_fetch = |ranges: Vec>, coalesce: usize| async move { - let max = ranges.iter().map(|x| x.end).max().unwrap_or(0); - let src: Vec<_> = (0..max).map(|x| x as u8).collect(); - - let mut fetches = vec![]; - let coalesced = coalesce_ranges( - &ranges, - |range| { - fetches.push(range.clone()); - futures::future::ready(Ok(Bytes::from(src[range].to_vec()))) - }, - coalesce, - ) - .await - .unwrap(); - - assert_eq!(ranges.len(), coalesced.len()); - for (range, bytes) in ranges.iter().zip(coalesced) { - assert_eq!(bytes.as_ref(), &src[range.clone()]); - } - fetches - }; - let fetches = do_fetch(vec![], 0).await; assert_eq!(fetches, vec![]); @@ -180,12 +221,51 @@ mod tests { assert_eq!(fetches, vec![0..1, 56..75]); let fetches = do_fetch(vec![0..1, 5..6, 7..9, 2..3, 4..6], 1).await; - assert_eq!(fetches, vec![0..1, 5..9, 2..6]); + assert_eq!(fetches, vec![0..9]); let fetches = do_fetch(vec![0..1, 5..6, 7..9, 2..3, 4..6], 1).await; - assert_eq!(fetches, vec![0..1, 5..9, 2..6]); + assert_eq!(fetches, vec![0..9]); let fetches = do_fetch(vec![0..1, 6..7, 8..9, 10..14, 9..10], 4).await; assert_eq!(fetches, vec![0..1, 6..14]); } + + #[tokio::test] + async fn test_coalesce_fuzz() { + let mut rand = thread_rng(); + for _ in 0..100 { + let object_len = rand.gen_range(10..250); + let range_count = rand.gen_range(0..10); + let ranges: Vec<_> = (0..range_count) + .map(|_| { + let start = rand.gen_range(0..object_len); + let max_len = 20.min(object_len - start); + let len = rand.gen_range(0..max_len); + start..start + len + }) + .collect(); + + let coalesce = rand.gen_range(1..5); + let fetches = do_fetch(ranges.clone(), coalesce).await; + + for fetch in fetches.windows(2) { + assert!( + fetch[0].start <= fetch[1].start, + "fetches should be sorted, {:?} vs {:?}", + fetch[0], + fetch[1] + ); + + let delta = fetch[1].end - fetch[0].end; + assert!( + delta > coalesce, + "fetches should not overlap by {}, {:?} vs {:?} for {:?}", + coalesce, + fetch[0], + fetch[1], + ranges + ); + } + } + } } From c6f86c2fefc84946fdd07cf6b01866fc6aad4455 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 25 Aug 2022 14:14:20 +0100 Subject: [PATCH 032/397] Fix retry logic (#2573) (#2572) (#2574) * Fix retry logic (#2573) (#2572) * Fix logical conflicts * Rework tests --- Cargo.toml | 1 + src/aws/client.rs | 50 ++++----- src/aws/mod.rs | 2 +- src/azure/client.rs | 37 +++---- src/azure/mod.rs | 2 +- src/client/oauth.rs | 13 +-- src/client/retry.rs | 242 +++++++++++++++++++++++++++++++++++++++++--- src/gcp.rs | 83 +++++---------- 8 files changed, 297 insertions(+), 133 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b5c5ef6..2be233c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,3 +63,4 @@ dotenv = "0.15.0" tempfile = "3.1.0" futures-test = "0.3" rand = "0.8" +hyper = { version = "0.14", features = ["server"] } diff --git a/src/aws/client.rs b/src/aws/client.rs index 36ba9ad..d8ab3bb 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -52,36 +52,48 @@ const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); pub(crate) enum Error { #[snafu(display("Error performing get request {}: {}", path, source))] GetRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error fetching get response body 
{}: {}", path, source))] + GetResponseBody { source: reqwest::Error, path: String, }, #[snafu(display("Error performing put request {}: {}", path, source))] PutRequest { - source: reqwest::Error, + source: crate::client::retry::Error, path: String, }, #[snafu(display("Error performing delete request {}: {}", path, source))] DeleteRequest { - source: reqwest::Error, + source: crate::client::retry::Error, path: String, }, #[snafu(display("Error performing copy request {}: {}", path, source))] CopyRequest { - source: reqwest::Error, + source: crate::client::retry::Error, path: String, }, #[snafu(display("Error performing list request: {}", source))] - ListRequest { source: reqwest::Error }, + ListRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting list response body: {}", source))] + ListResponseBody { source: reqwest::Error }, #[snafu(display("Error performing create multipart request: {}", source))] - CreateMultipartRequest { source: reqwest::Error }, + CreateMultipartRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting create multipart response body: {}", source))] + CreateMultipartResponseBody { source: reqwest::Error }, #[snafu(display("Error performing complete multipart request: {}", source))] - CompleteMultipartRequest { source: reqwest::Error }, + CompleteMultipartRequest { source: crate::client::retry::Error }, #[snafu(display("Got invalid list response: {}", source))] InvalidListResponse { source: quick_xml::de::DeError }, @@ -259,10 +271,6 @@ impl S3Client { .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") .send_retry(&self.config.retry_config) .await - .context(GetRequestSnafu { - path: path.as_ref(), - })? - .error_for_status() .context(GetRequestSnafu { path: path.as_ref(), })?; @@ -290,10 +298,6 @@ impl S3Client { .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") .send_retry(&self.config.retry_config) .await - .context(PutRequestSnafu { - path: path.as_ref(), - })? - .error_for_status() .context(PutRequestSnafu { path: path.as_ref(), })?; @@ -316,10 +320,6 @@ impl S3Client { .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") .send_retry(&self.config.retry_config) .await - .context(DeleteRequestSnafu { - path: path.as_ref(), - })? - .error_for_status() .context(DeleteRequestSnafu { path: path.as_ref(), })?; @@ -339,10 +339,6 @@ impl S3Client { .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") .send_retry(&self.config.retry_config) .await - .context(CopyRequestSnafu { - path: from.as_ref(), - })? - .error_for_status() .context(CopyRequestSnafu { path: from.as_ref(), })?; @@ -385,11 +381,9 @@ impl S3Client { .send_retry(&self.config.retry_config) .await .context(ListRequestSnafu)? - .error_for_status() - .context(ListRequestSnafu)? .bytes() .await - .context(ListRequestSnafu)?; + .context(ListResponseBodySnafu)?; let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) .context(InvalidListResponseSnafu)?; @@ -430,11 +424,9 @@ impl S3Client { .send_retry(&self.config.retry_config) .await .context(CreateMultipartRequestSnafu)? - .error_for_status() - .context(CreateMultipartRequestSnafu)? 
.bytes() .await - .context(CreateMultipartRequestSnafu)?; + .context(CreateMultipartResponseBodySnafu)?; let response: InitiateMultipart = quick_xml::de::from_reader(response.reader()) .context(InvalidMultipartResponseSnafu)?; @@ -470,8 +462,6 @@ impl S3Client { .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") .send_retry(&self.config.retry_config) .await - .context(CompleteMultipartRequestSnafu)? - .error_for_status() .context(CompleteMultipartRequestSnafu)?; Ok(()) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index f7955ad..ab90afa 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -175,7 +175,7 @@ impl ObjectStore for AmazonS3 { .await? .bytes() .await - .map_err(|source| client::Error::GetRequest { + .map_err(|source| client::Error::GetResponseBody { source, path: location.to_string(), })?; diff --git a/src/azure/client.rs b/src/azure/client.rs index 5f37ea9..722f676 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -41,30 +41,39 @@ use url::Url; pub(crate) enum Error { #[snafu(display("Error performing get request {}: {}", path, source))] GetRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error getting get response body {}: {}", path, source))] + GetResponseBody { source: reqwest::Error, path: String, }, #[snafu(display("Error performing put request {}: {}", path, source))] PutRequest { - source: reqwest::Error, + source: crate::client::retry::Error, path: String, }, #[snafu(display("Error performing delete request {}: {}", path, source))] DeleteRequest { - source: reqwest::Error, + source: crate::client::retry::Error, path: String, }, #[snafu(display("Error performing copy request {}: {}", path, source))] CopyRequest { - source: reqwest::Error, + source: crate::client::retry::Error, path: String, }, #[snafu(display("Error performing list request: {}", source))] - ListRequest { source: reqwest::Error }, + ListRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting list response body: {}", source))] + ListResponseBody { source: reqwest::Error }, #[snafu(display("Error performing create multipart request: {}", source))] CreateMultipartRequest { source: reqwest::Error }, @@ -218,10 +227,6 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(PutRequestSnafu { - path: path.as_ref(), - })? - .error_for_status() .context(PutRequestSnafu { path: path.as_ref(), })?; @@ -259,10 +264,6 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(GetRequestSnafu { - path: path.as_ref(), - })? - .error_for_status() .context(GetRequestSnafu { path: path.as_ref(), })?; @@ -286,10 +287,6 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(DeleteRequestSnafu { - path: path.as_ref(), - })? - .error_for_status() .context(DeleteRequestSnafu { path: path.as_ref(), })?; @@ -328,10 +325,6 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(CopyRequestSnafu { - path: from.as_ref(), - })? - .error_for_status() .context(CopyRequestSnafu { path: from.as_ref(), })?; @@ -373,11 +366,9 @@ impl AzureClient { .send_retry(&self.config.retry_config) .await .context(ListRequestSnafu)? - .error_for_status() - .context(ListRequestSnafu)? 
.bytes() .await - .context(ListRequestSnafu)?; + .context(ListResponseBodySnafu)?; let mut response: ListResultInternal = quick_xml::de::from_reader(response.reader()) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 53e7ed6..c659e1f 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -188,7 +188,7 @@ impl ObjectStore for MicrosoftAzure { .await? .bytes() .await - .map_err(|source| client::Error::GetRequest { + .map_err(|source| client::Error::GetResponseBody { source, path: location.to_string(), })?; diff --git a/src/client/oauth.rs b/src/client/oauth.rs index 2209406..6b3acea 100644 --- a/src/client/oauth.rs +++ b/src/client/oauth.rs @@ -45,7 +45,10 @@ pub enum Error { UnsupportedKey { encoding: String }, #[snafu(display("Error performing token request: {}", source))] - TokenRequest { source: reqwest::Error }, + TokenRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting token response body: {}", source))] + TokenResponseBody { source: reqwest::Error }, } pub type Result = std::result::Result; @@ -181,11 +184,9 @@ impl OAuthProvider { .send_retry(retry) .await .context(TokenRequestSnafu)? - .error_for_status() - .context(TokenRequestSnafu)? .json() .await - .context(TokenRequestSnafu)?; + .context(TokenResponseBodySnafu)?; let token = TemporaryToken { token: response.access_token, @@ -289,10 +290,10 @@ impl ClientSecretOAuthProvider { .await .context(TokenRequestSnafu)? .error_for_status() - .context(TokenRequestSnafu)? + .context(TokenResponseBodySnafu)? .json() .await - .context(TokenRequestSnafu)?; + .context(TokenResponseBodySnafu)?; let token = TemporaryToken { token: response.access_token, diff --git a/src/client/retry.rs b/src/client/retry.rs index c4dd6ee..44d7835 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -20,10 +20,55 @@ use crate::client::backoff::{Backoff, BackoffConfig}; use futures::future::BoxFuture; use futures::FutureExt; -use reqwest::{Response, Result}; +use reqwest::{Response, StatusCode}; +use snafu::Snafu; use std::time::{Duration, Instant}; use tracing::info; +/// Retry request error +#[derive(Debug, Snafu)] +#[snafu(display( + "response error \"{}\", after {} retries: {}", + message, + retries, + source +))] +pub struct Error { + retries: usize, + message: String, + source: reqwest::Error, +} + +impl Error { + /// Returns the status code associated with this error if any + pub fn status(&self) -> Option { + self.source.status() + } +} + +impl From for std::io::Error { + fn from(err: Error) -> Self { + use std::io::ErrorKind; + if err.source.is_builder() || err.source.is_request() { + Self::new(ErrorKind::InvalidInput, err) + } else if let Some(s) = err.source.status() { + match s { + StatusCode::NOT_FOUND => Self::new(ErrorKind::NotFound, err), + StatusCode::BAD_REQUEST => Self::new(ErrorKind::InvalidInput, err), + _ => Self::new(ErrorKind::Other, err), + } + } else if err.source.is_timeout() { + Self::new(ErrorKind::TimedOut, err) + } else if err.source.is_connect() { + Self::new(ErrorKind::NotConnected, err) + } else { + Self::new(ErrorKind::Other, err) + } + } +} + +pub type Result = std::result::Result; + /// Contains the configuration for how to respond to server errors /// /// By default they will be retried up to some limit, using exponential @@ -85,22 +130,195 @@ impl RetryExt for reqwest::RequestBuilder { loop { let s = self.try_clone().expect("request body must be cloneable"); match s.send().await { - Err(e) - if retries < max_retries - && now.elapsed() < retry_timeout - && e.status() - .map(|s| 
s.is_server_error()) - .unwrap_or(false) => + Ok(r) => match r.error_for_status_ref() { + Ok(_) => return Ok(r), + Err(e) => { + let status = r.status(); + + if retries == max_retries + || now.elapsed() > retry_timeout + || !status.is_server_error() { + + // Get the response message if returned a client error + let message = match status.is_client_error() { + true => match r.text().await { + Ok(message) if !message.is_empty() => message, + Ok(_) => "No Body".to_string(), + Err(e) => format!("error getting response body: {}", e) + } + false => status.to_string(), + }; + + return Err(Error{ + message, + retries, + source: e, + }) + + } + + let sleep = backoff.next(); + retries += 1; + info!("Encountered server error, backing off for {} seconds, retry {} of {}", sleep.as_secs_f32(), retries, max_retries); + tokio::time::sleep(sleep).await; + } + }, + Err(e) => { - let sleep = backoff.next(); - retries += 1; - info!("Encountered server error, backing off for {} seconds, retry {} of {}", sleep.as_secs_f32(), retries, max_retries); - tokio::time::sleep(sleep).await; + return Err(Error{ + retries, + message: "request error".to_string(), + source: e + }) } - r => return r, } } } .boxed() } } + +#[cfg(test)] +mod tests { + use crate::client::retry::RetryExt; + use crate::RetryConfig; + use hyper::header::LOCATION; + use hyper::service::{make_service_fn, service_fn}; + use hyper::{Body, Response, Server}; + use parking_lot::Mutex; + use reqwest::{Client, Method, StatusCode}; + use std::collections::VecDeque; + use std::convert::Infallible; + use std::net::SocketAddr; + use std::sync::Arc; + use std::time::Duration; + + #[tokio::test] + async fn test_retry() { + let responses: Arc>>> = + Arc::new(Mutex::new(VecDeque::with_capacity(10))); + + let r = Arc::clone(&responses); + let make_service = make_service_fn(move |_conn| { + let r = Arc::clone(&r); + async move { + Ok::<_, Infallible>(service_fn(move |_req| { + let r = Arc::clone(&r); + async move { + Ok::<_, Infallible>(match r.lock().pop_front() { + Some(r) => r, + None => Response::new(Body::from("Hello World")), + }) + } + })) + } + }); + + let (tx, rx) = tokio::sync::oneshot::channel::<()>(); + let server = + Server::bind(&SocketAddr::from(([127, 0, 0, 1], 0))).serve(make_service); + + let url = format!("http://{}", server.local_addr()); + + let server_handle = tokio::spawn(async move { + server + .with_graceful_shutdown(async { + rx.await.ok(); + }) + .await + .unwrap() + }); + + let retry = RetryConfig { + backoff: Default::default(), + max_retries: 2, + retry_timeout: Duration::from_secs(1000), + }; + + let client = Client::new(); + let do_request = || client.request(Method::GET, &url).send_retry(&retry); + + // Simple request should work + let r = do_request().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + + // Returns client errors immediately with status message + responses.lock().push_back( + Response::builder() + .status(StatusCode::BAD_REQUEST) + .body(Body::from("cupcakes")) + .unwrap(), + ); + + let e = do_request().await.unwrap_err(); + assert_eq!(e.status().unwrap(), StatusCode::BAD_REQUEST); + assert_eq!(e.retries, 0); + assert_eq!(&e.message, "cupcakes"); + + // Handles client errors with no payload + responses.lock().push_back( + Response::builder() + .status(StatusCode::BAD_REQUEST) + .body(Body::empty()) + .unwrap(), + ); + + let e = do_request().await.unwrap_err(); + assert_eq!(e.status().unwrap(), StatusCode::BAD_REQUEST); + assert_eq!(e.retries, 0); + assert_eq!(&e.message, "No Body"); + + // Should retry 
server error request + responses.lock().push_back( + Response::builder() + .status(StatusCode::BAD_GATEWAY) + .body(Body::empty()) + .unwrap(), + ); + + let r = do_request().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + + // Accepts 204 status code + responses.lock().push_back( + Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty()) + .unwrap(), + ); + + let r = do_request().await.unwrap(); + assert_eq!(r.status(), StatusCode::NO_CONTENT); + + // Follows redirects + responses.lock().push_back( + Response::builder() + .status(StatusCode::FOUND) + .header(LOCATION, "/foo") + .body(Body::empty()) + .unwrap(), + ); + + let r = do_request().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + assert_eq!(r.url().path(), "/foo"); + + // Gives up after the retrying the specified number of times + for _ in 0..=retry.max_retries { + responses.lock().push_back( + Response::builder() + .status(StatusCode::BAD_GATEWAY) + .body(Body::from("ignored")) + .unwrap(), + ); + } + + let e = do_request().await.unwrap_err(); + assert_eq!(e.retries, retry.max_retries); + assert_eq!(e.message, "502 Bad Gateway"); + + // Shutdown + let _ = tx.send(()); + server_handle.await.unwrap(); + } +} diff --git a/src/gcp.rs b/src/gcp.rs index c9bb633..e9c7d02 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -72,28 +72,40 @@ enum Error { }, #[snafu(display("Error performing list request: {}", source))] - ListRequest { source: reqwest::Error }, + ListRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting list response body: {}", source))] + ListResponseBody { source: reqwest::Error }, #[snafu(display("Error performing get request {}: {}", path, source))] GetRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error getting get response body {}: {}", path, source))] + GetResponseBody { source: reqwest::Error, path: String, }, #[snafu(display("Error performing delete request {}: {}", path, source))] DeleteRequest { - source: reqwest::Error, + source: crate::client::retry::Error, path: String, }, #[snafu(display("Error performing copy request {}: {}", path, source))] CopyRequest { - source: reqwest::Error, + source: crate::client::retry::Error, path: String, }, #[snafu(display("Error performing put request: {}", source))] - PutRequest { source: reqwest::Error }, + PutRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting put response body: {}", source))] + PutResponseBody { source: reqwest::Error }, #[snafu(display("Error decoding object size: {}", source))] InvalidSize { source: std::num::ParseIntError }, @@ -269,10 +281,6 @@ impl GoogleCloudStorageClient { .query(&[("alt", alt)]) .send_retry(&self.retry_config) .await - .context(GetRequestSnafu { - path: path.as_ref(), - })? - .error_for_status() .context(GetRequestSnafu { path: path.as_ref(), })?; @@ -297,8 +305,6 @@ impl GoogleCloudStorageClient { .body(payload) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)? - .error_for_status() .context(PutRequestSnafu)?; Ok(()) @@ -318,11 +324,9 @@ impl GoogleCloudStorageClient { .query(&[("uploads", "")]) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)? 
- .error_for_status() .context(PutRequestSnafu)?; - let data = response.bytes().await.context(PutRequestSnafu)?; + let data = response.bytes().await.context(PutResponseBodySnafu)?; let result: InitiateMultipartUploadResult = quick_xml::de::from_reader( data.as_ref().reader(), ) @@ -352,8 +356,6 @@ impl GoogleCloudStorageClient { .query(&[("uploadId", multipart_id)]) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)? - .error_for_status() .context(PutRequestSnafu)?; Ok(()) @@ -369,10 +371,6 @@ impl GoogleCloudStorageClient { .bearer_auth(token) .send_retry(&self.retry_config) .await - .context(DeleteRequestSnafu { - path: path.as_ref(), - })? - .error_for_status() .context(DeleteRequestSnafu { path: path.as_ref(), })?; @@ -412,10 +410,6 @@ impl GoogleCloudStorageClient { .bearer_auth(token) .send_retry(&self.retry_config) .await - .context(CopyRequestSnafu { - path: from.as_ref(), - })? - .error_for_status() .context(CopyRequestSnafu { path: from.as_ref(), })?; @@ -462,11 +456,9 @@ impl GoogleCloudStorageClient { .send_retry(&self.retry_config) .await .context(ListRequestSnafu)? - .error_for_status() - .context(ListRequestSnafu)? .json() .await - .context(ListRequestSnafu)?; + .context(ListResponseBodySnafu)?; Ok(response) } @@ -489,27 +481,6 @@ impl GoogleCloudStorageClient { } } -fn reqwest_error_as_io(err: reqwest::Error) -> io::Error { - if err.is_builder() || err.is_request() { - io::Error::new(io::ErrorKind::InvalidInput, err) - } else if err.is_status() { - match err.status() { - Some(StatusCode::NOT_FOUND) => io::Error::new(io::ErrorKind::NotFound, err), - Some(StatusCode::BAD_REQUEST) => { - io::Error::new(io::ErrorKind::InvalidInput, err) - } - Some(_) => io::Error::new(io::ErrorKind::Other, err), - None => io::Error::new(io::ErrorKind::Other, err), - } - } else if err.is_timeout() { - io::Error::new(io::ErrorKind::TimedOut, err) - } else if err.is_connect() { - io::Error::new(io::ErrorKind::NotConnected, err) - } else { - io::Error::new(io::ErrorKind::Other, err) - } -} - struct GCSMultipartUpload { client: Arc, encoded_path: String, @@ -549,10 +520,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .header(header::CONTENT_LENGTH, format!("{}", buf.len())) .body(buf) .send_retry(&self.client.retry_config) - .await - .map_err(reqwest_error_as_io)? - .error_for_status() - .map_err(reqwest_error_as_io)?; + .await?; let content_id = response .headers() @@ -609,10 +577,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .query(&[("uploadId", upload_id)]) .body(data) .send_retry(&self.client.retry_config) - .await - .map_err(reqwest_error_as_io)? - .error_for_status() - .map_err(reqwest_error_as_io)?; + .await?; Ok(()) } @@ -672,14 +637,14 @@ impl ObjectStore for GoogleCloudStorage { .client .get_request(location, Some(range), false) .await?; - Ok(response.bytes().await.context(GetRequestSnafu { + Ok(response.bytes().await.context(GetResponseBodySnafu { path: location.as_ref(), })?) 
} async fn head(&self, location: &Path) -> Result { let response = self.client.get_request(location, None, true).await?; - let object = response.json().await.context(GetRequestSnafu { + let object = response.json().await.context(GetResponseBodySnafu { path: location.as_ref(), })?; convert_object_meta(&object) @@ -1057,9 +1022,7 @@ mod test { .unwrap_err() .to_string(); assert!( - err.contains( - "Error performing put request: HTTP status client error (404 Not Found)" - ), + err.contains("HTTP status client error (404 Not Found)"), "{}", err ) From 3e046bdfd6e6f0801e1de5735bd783c0e2d46a2c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 27 Aug 2022 17:54:22 +0100 Subject: [PATCH 033/397] ObjectStore cleanup (#2587) (#2590) * ObjectStore cleanup (#2587) * Fix CI --- Cargo.toml | 4 +- src/azure/client.rs | 13 +-- src/azure/credential.rs | 99 +++++++++++++++++++++- src/azure/mod.rs | 16 ++-- src/client/mod.rs | 2 - src/gcp.rs | 15 +++- src/{client/oauth.rs => gcp/credential.rs} | 85 +------------------ src/lib.rs | 4 - 8 files changed, 124 insertions(+), 114 deletions(-) rename src/{client/oauth.rs => gcp/credential.rs} (72%) diff --git a/Cargo.toml b/Cargo.toml index 2be233c..ccfe123 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,9 +53,9 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } [features] -cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] -gcp = ["cloud"] +gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] [dev-dependencies] # In alphabetical order diff --git a/src/azure/client.rs b/src/azure/client.rs index 722f676..9f87a88 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -75,20 +75,13 @@ pub(crate) enum Error { #[snafu(display("Error getting list response body: {}", source))] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Error performing create multipart request: {}", source))] - CreateMultipartRequest { source: reqwest::Error }, - - #[snafu(display("Error performing complete multipart request: {}", source))] - CompleteMultipartRequest { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Got invalid multipart response: {}", source))] - InvalidMultipartResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error authorizing request: {}", source))] - Authorization { source: crate::client::oauth::Error }, + Authorization { + source: crate::azure::credential::Error, + }, } impl From for crate::Error { diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 9357e80..721fcae 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -15,19 +15,24 @@ // specific language governing permissions and limitations // under the License. 
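// The hunk below reworks src/azure/credential.rs, which signs shared-key
// requests with the storage account key via the hmac_sha256 helper added in
// src/util.rs. A condensed sketch of that shared-key signature (the canonical
// string-to-sign construction is omitted; names here are illustrative, not the
// crate's API):
fn shared_key_header_sketch(account: &str, base64_key: &str, string_to_sign: &str) -> String {
    let key_bytes = base64::decode(base64_key).expect("account key should be valid base64");
    let key = ring::hmac::Key::new(ring::hmac::HMAC_SHA256, &key_bytes);
    let signature = ring::hmac::sign(&key, string_to_sign.as_bytes());
    // Authorization header value: "SharedKey <account>:<base64 signature>"
    format!("SharedKey {}:{}", account, base64::encode(signature.as_ref()))
}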
-use crate::client::oauth::ClientSecretOAuthProvider; +use crate::client::retry::RetryExt; +use crate::client::token::{TemporaryToken, TokenCache}; use crate::util::hmac_sha256; +use crate::RetryConfig; use chrono::Utc; +use reqwest::header::ACCEPT; use reqwest::{ header::{ HeaderMap, HeaderName, HeaderValue, AUTHORIZATION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, IF_MODIFIED_SINCE, IF_NONE_MATCH, IF_UNMODIFIED_SINCE, RANGE, }, - Method, RequestBuilder, + Client, Method, RequestBuilder, }; +use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::str; +use std::time::{Duration, Instant}; use url::Url; static AZURE_VERSION: HeaderValue = HeaderValue::from_static("2021-08-06"); @@ -38,6 +43,18 @@ pub(crate) static DELETE_SNAPSHOTS: HeaderName = pub(crate) static COPY_SOURCE: HeaderName = HeaderName::from_static("x-ms-copy-source"); static CONTENT_MD5: HeaderName = HeaderName::from_static("content-md5"); pub(crate) static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; +const CONTENT_TYPE_JSON: &str = "application/json"; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Error performing token request: {}", source))] + TokenRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting token response body: {}", source))] + TokenResponseBody { source: reqwest::Error }, +} + +pub type Result = std::result::Result; /// Provides credentials for use when signing requests #[derive(Debug)] @@ -253,3 +270,81 @@ fn lexy_sort<'a>( values.sort_unstable(); values } + +#[derive(serde::Deserialize, Debug)] +struct TokenResponse { + access_token: String, + expires_in: u64, +} + +/// Encapsulates the logic to perform an OAuth token challenge +#[derive(Debug)] +pub struct ClientSecretOAuthProvider { + scope: String, + token_url: String, + client_id: String, + client_secret: String, + cache: TokenCache, +} + +impl ClientSecretOAuthProvider { + /// Create a new [`ClientSecretOAuthProvider`] for an azure backed store + pub fn new( + client_id: String, + client_secret: String, + tenant_id: String, + authority_host: Option, + ) -> Self { + let authority_host = authority_host + .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); + + Self { + scope: "https://storage.azure.com/.default".to_owned(), + token_url: format!("{}/{}/oauth2/v2.0/token", authority_host, tenant_id), + client_id, + client_secret, + cache: TokenCache::default(), + } + } + + /// Fetch a token + pub async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result { + self.cache + .get_or_insert_with(|| self.fetch_token_inner(client, retry)) + .await + } + + /// Fetch a fresh token + async fn fetch_token_inner( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result> { + let response: TokenResponse = client + .request(Method::POST, &self.token_url) + .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) + .form(&[ + ("client_id", self.client_id.as_str()), + ("client_secret", self.client_secret.as_str()), + ("scope", self.scope.as_str()), + ("grant_type", "client_credentials"), + ]) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? 
+ .json() + .await + .context(TokenResponseBodySnafu)?; + + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + + Ok(token) + } +} diff --git a/src/azure/mod.rs b/src/azure/mod.rs index c659e1f..dd1cde9 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -110,6 +110,9 @@ enum Error { #[snafu(display("At least one authorization option must be specified"))] MissingCredentials {}, + + #[snafu(display("Azure credential error: {}", source), context(false))] + Credential { source: credential::Error }, } impl From for super::Error { @@ -539,13 +542,12 @@ impl MicrosoftAzureBuilder { } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = (client_id, client_secret, tenant_id) { - let client_credential = - crate::client::oauth::ClientSecretOAuthProvider::new_azure( - client_id, - client_secret, - tenant_id, - authority_host, - ); + let client_credential = credential::ClientSecretOAuthProvider::new( + client_id, + client_secret, + tenant_id, + authority_host, + ); Ok(credential::CredentialProvider::ClientSecret( client_credential, )) diff --git a/src/client/mod.rs b/src/client/mod.rs index 10e8d91..e6de3e9 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -18,8 +18,6 @@ //! Generic utilities reqwest based ObjectStore implementations pub mod backoff; -#[cfg(any(feature = "gcp", feature = "azure"))] -pub mod oauth; pub mod pagination; pub mod retry; pub mod token; diff --git a/src/gcp.rs b/src/gcp.rs index e9c7d02..65adf91 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -48,13 +48,17 @@ use tokio::io::AsyncWrite; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::{ - client::{oauth::OAuthProvider, token::TokenCache}, + client::token::TokenCache, multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, util::{format_http_range, format_prefix}, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, }; +use credential::OAuthProvider; + +mod credential; + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Unable to open service account file: {}", source))] @@ -115,6 +119,9 @@ enum Error { #[snafu(display("Missing service account path"))] MissingServiceAccountPath, + + #[snafu(display("GCP credential error: {}", source))] + Credential { source: credential::Error }, } impl From for super::Error { @@ -240,7 +247,8 @@ impl GoogleCloudStorageClient { .get_or_insert_with(|| { oauth_provider.fetch_token(&self.client, &self.retry_config) }) - .await?) + .await + .context(CredentialSnafu)?) } else { Ok("".to_owned()) } @@ -818,7 +826,8 @@ impl GoogleCloudStorageBuilder { audience, ) }) - .transpose()?; + .transpose() + .context(CredentialSnafu)?; let encoded_bucket_name = percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); diff --git a/src/client/oauth.rs b/src/gcp/credential.rs similarity index 72% rename from src/client/oauth.rs rename to src/gcp/credential.rs index 6b3acea..5b8cdb8 100644 --- a/src/client/oauth.rs +++ b/src/gcp/credential.rs @@ -16,17 +16,13 @@ // under the License. 
use crate::client::retry::RetryExt; -use crate::client::token::{TemporaryToken, TokenCache}; +use crate::client::token::TemporaryToken; use crate::RetryConfig; -use reqwest::header::{HeaderMap, HeaderValue, ACCEPT}; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; use std::time::{Duration, Instant}; -const CONTENT_TYPE_JSON: &str = "application/json"; -const AZURE_STORAGE_TOKEN_SCOPE: &str = "https://storage.azure.com/.default"; - #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("No RSA key found in pem file"))] @@ -224,82 +220,3 @@ fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; Ok(base64::encode_config(string, base64::URL_SAFE_NO_PAD)) } - -/// Encapsulates the logic to perform an OAuth token challenge -#[derive(Debug)] -pub struct ClientSecretOAuthProvider { - scope: String, - token_url: String, - client_id: String, - client_secret: String, - cache: TokenCache, -} - -impl ClientSecretOAuthProvider { - /// Create a new [`ClientSecretOAuthProvider`] for an azure backed store - pub fn new_azure( - client_id: String, - client_secret: String, - tenant_id: String, - authority_host: Option, - ) -> Self { - let authority_host = authority_host.unwrap_or_else(|| { - crate::azure::authority_hosts::AZURE_PUBLIC_CLOUD.to_owned() - }); - - Self { - scope: AZURE_STORAGE_TOKEN_SCOPE.to_owned(), - token_url: format!("{}/{}/oauth2/v2.0/token", authority_host, tenant_id), - client_id, - client_secret, - cache: TokenCache::default(), - } - } - - /// Fetch a token - pub async fn fetch_token( - &self, - client: &Client, - retry: &RetryConfig, - ) -> Result { - self.cache - .get_or_insert_with(|| self.fetch_token_inner(client, retry)) - .await - } - - /// Fetch a fresh token - async fn fetch_token_inner( - &self, - client: &Client, - retry: &RetryConfig, - ) -> Result> { - let mut headers = HeaderMap::new(); - headers.append(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)); - - let mut params = std::collections::HashMap::new(); - params.insert("client_id", self.client_id.as_str()); - params.insert("client_secret", self.client_secret.as_str()); - params.insert("scope", self.scope.as_str()); - params.insert("grant_type", "client_credentials"); - - let response: TokenResponse = client - .request(Method::POST, &self.token_url) - .headers(headers) - .form(¶ms) - .send_retry(retry) - .await - .context(TokenRequestSnafu)? - .error_for_status() - .context(TokenResponseBodySnafu)? 
- .json() - .await - .context(TokenResponseBodySnafu)?; - - let token = TemporaryToken { - token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), - }; - - Ok(token) - } -} diff --git a/src/lib.rs b/src/lib.rs index 5811eba..9ed9db9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -465,10 +465,6 @@ pub enum Error { #[snafu(display("Operation not yet implemented."))] NotImplemented, - - #[cfg(feature = "gcp")] - #[snafu(display("OAuth error: {}", source), context(false))] - OAuth { source: client::oauth::Error }, } impl From for std::io::Error { From e4b444278b1a243ead0c77254a93d8db21e1b24b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 30 Aug 2022 17:42:26 +0100 Subject: [PATCH 034/397] Add IMDSv1 fallback (#2609) (#2610) * Add IMDSv1 fallback (#2609) * Add config option --- src/aws/credential.rs | 165 ++++++++++++++++++++++++++++++++------ src/aws/mod.rs | 19 +++++ src/client/mock_server.rs | 105 ++++++++++++++++++++++++ src/client/mod.rs | 2 + src/client/retry.rs | 60 +++----------- 5 files changed, 276 insertions(+), 75 deletions(-) create mode 100644 src/client/mock_server.rs diff --git a/src/aws/credential.rs b/src/aws/credential.rs index e6c1bdd..1abf42b 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -23,11 +23,12 @@ use bytes::Buf; use chrono::{DateTime, Utc}; use futures::TryFutureExt; use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, Method, Request, RequestBuilder}; +use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; use std::collections::BTreeMap; use std::sync::Arc; use std::time::Instant; +use tracing::warn; type StdError = Box; @@ -284,6 +285,7 @@ pub struct InstanceCredentialProvider { pub cache: TokenCache>, pub client: Client, pub retry_config: RetryConfig, + pub imdsv1_fallback: bool, } impl InstanceCredentialProvider { @@ -291,11 +293,16 @@ impl InstanceCredentialProvider { self.cache .get_or_insert_with(|| { const METADATA_ENDPOINT: &str = "http://169.254.169.254"; - instance_creds(&self.client, &self.retry_config, METADATA_ENDPOINT) - .map_err(|source| crate::Error::Generic { - store: "S3", - source, - }) + instance_creds( + &self.client, + &self.retry_config, + METADATA_ENDPOINT, + self.imdsv1_fallback, + ) + .map_err(|source| crate::Error::Generic { + store: "S3", + source, + }) }) .await } @@ -360,36 +367,47 @@ async fn instance_creds( client: &Client, retry_config: &RetryConfig, endpoint: &str, + imdsv1_fallback: bool, ) -> Result>, StdError> { const CREDENTIALS_PATH: &str = "latest/meta-data/iam/security-credentials"; const AWS_EC2_METADATA_TOKEN_HEADER: &str = "X-aws-ec2-metadata-token"; let token_url = format!("{}/latest/api/token", endpoint); - let token = client + + let token_result = client .request(Method::PUT, token_url) .header("X-aws-ec2-metadata-token-ttl-seconds", "600") // 10 minute TTL .send_retry(retry_config) - .await? - .text() - .await?; + .await; + + let token = match token_result { + Ok(t) => Some(t.text().await?), + Err(e) + if imdsv1_fallback && matches!(e.status(), Some(StatusCode::FORBIDDEN)) => + { + warn!("received 403 from metadata endpoint, falling back to IMDSv1"); + None + } + Err(e) => return Err(e.into()), + }; let role_url = format!("{}/{}/", endpoint, CREDENTIALS_PATH); - let role = client - .request(Method::GET, role_url) - .header(AWS_EC2_METADATA_TOKEN_HEADER, &token) - .send_retry(retry_config) - .await? 
- .text() - .await?; + let mut role_request = client.request(Method::GET, role_url); + + if let Some(token) = &token { + role_request = role_request.header(AWS_EC2_METADATA_TOKEN_HEADER, token); + } + + let role = role_request.send_retry(retry_config).await?.text().await?; let creds_url = format!("{}/{}/{}", endpoint, CREDENTIALS_PATH, role); - let creds: InstanceCredentials = client - .request(Method::GET, creds_url) - .header(AWS_EC2_METADATA_TOKEN_HEADER, &token) - .send_retry(retry_config) - .await? - .json() - .await?; + let mut creds_request = client.request(Method::GET, creds_url); + if let Some(token) = &token { + creds_request = creds_request.header(AWS_EC2_METADATA_TOKEN_HEADER, token); + } + + let creds: InstanceCredentials = + creds_request.send_retry(retry_config).await?.json().await?; let now = Utc::now(); let ttl = (creds.expiration - now).to_std().unwrap_or_default(); @@ -470,6 +488,8 @@ async fn web_identity( #[cfg(test)] mod tests { use super::*; + use crate::client::mock_server::MockServer; + use hyper::{Body, Response}; use reqwest::{Client, Method}; use std::env; @@ -567,11 +587,11 @@ mod tests { assert_eq!( resp.status(), - reqwest::StatusCode::UNAUTHORIZED, + StatusCode::UNAUTHORIZED, "Ensure metadata endpoint is set to only allow IMDSv2" ); - let creds = instance_creds(&client, &retry_config, &endpoint) + let creds = instance_creds(&client, &retry_config, &endpoint, false) .await .unwrap(); @@ -583,4 +603,97 @@ mod tests { assert!(!secret.is_empty()); assert!(!token.is_empty()) } + + #[tokio::test] + async fn test_mock() { + let server = MockServer::new(); + + const IMDSV2_HEADER: &str = "X-aws-ec2-metadata-token"; + + let secret_access_key = "SECRET"; + let access_key_id = "KEYID"; + let token = "TOKEN"; + + let endpoint = server.url(); + let client = Client::new(); + let retry_config = RetryConfig::default(); + + // Test IMDSv2 + server.push_fn(|req| { + assert_eq!(req.uri().path(), "/latest/api/token"); + assert_eq!(req.method(), &Method::PUT); + Response::new(Body::from("cupcakes")) + }); + server.push_fn(|req| { + assert_eq!( + req.uri().path(), + "/latest/meta-data/iam/security-credentials/" + ); + assert_eq!(req.method(), &Method::GET); + let t = req.headers().get(IMDSV2_HEADER).unwrap().to_str().unwrap(); + assert_eq!(t, "cupcakes"); + Response::new(Body::from("myrole")) + }); + server.push_fn(|req| { + assert_eq!(req.uri().path(), "/latest/meta-data/iam/security-credentials/myrole"); + assert_eq!(req.method(), &Method::GET); + let t = req.headers().get(IMDSV2_HEADER).unwrap().to_str().unwrap(); + assert_eq!(t, "cupcakes"); + Response::new(Body::from(r#"{"AccessKeyId":"KEYID","Code":"Success","Expiration":"2022-08-30T10:51:04Z","LastUpdated":"2022-08-30T10:21:04Z","SecretAccessKey":"SECRET","Token":"TOKEN","Type":"AWS-HMAC"}"#)) + }); + + let creds = instance_creds(&client, &retry_config, endpoint, true) + .await + .unwrap(); + + assert_eq!(creds.token.token.as_deref().unwrap(), token); + assert_eq!(&creds.token.key_id, access_key_id); + assert_eq!(&creds.token.secret_key, secret_access_key); + + // Test IMDSv1 fallback + server.push_fn(|req| { + assert_eq!(req.uri().path(), "/latest/api/token"); + assert_eq!(req.method(), &Method::PUT); + Response::builder() + .status(StatusCode::FORBIDDEN) + .body(Body::empty()) + .unwrap() + }); + server.push_fn(|req| { + assert_eq!( + req.uri().path(), + "/latest/meta-data/iam/security-credentials/" + ); + assert_eq!(req.method(), &Method::GET); + assert!(req.headers().get(IMDSV2_HEADER).is_none()); + 
Response::new(Body::from("myrole")) + }); + server.push_fn(|req| { + assert_eq!(req.uri().path(), "/latest/meta-data/iam/security-credentials/myrole"); + assert_eq!(req.method(), &Method::GET); + assert!(req.headers().get(IMDSV2_HEADER).is_none()); + Response::new(Body::from(r#"{"AccessKeyId":"KEYID","Code":"Success","Expiration":"2022-08-30T10:51:04Z","LastUpdated":"2022-08-30T10:21:04Z","SecretAccessKey":"SECRET","Token":"TOKEN","Type":"AWS-HMAC"}"#)) + }); + + let creds = instance_creds(&client, &retry_config, endpoint, true) + .await + .unwrap(); + + assert_eq!(creds.token.token.as_deref().unwrap(), token); + assert_eq!(&creds.token.key_id, access_key_id); + assert_eq!(&creds.token.secret_key, secret_access_key); + + // Test IMDSv1 fallback disabled + server.push( + Response::builder() + .status(StatusCode::FORBIDDEN) + .body(Body::empty()) + .unwrap(), + ); + + // Should fail + instance_creds(&client, &retry_config, endpoint, false) + .await + .unwrap_err(); + } } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index ab90afa..d1d0a12 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -339,6 +339,7 @@ pub struct AmazonS3Builder { token: Option, retry_config: RetryConfig, allow_http: bool, + imdsv1_fallback: bool, } impl AmazonS3Builder { @@ -446,6 +447,23 @@ impl AmazonS3Builder { self } + /// By default instance credentials will only be fetched over [IMDSv2], as AWS recommends + /// against having IMDSv1 enabled on EC2 instances as it is vulnerable to [SSRF attack] + /// + /// However, certain deployment environments, such as those running old versions of kube2iam, + /// may not support IMDSv2. This option will enable automatic fallback to using IMDSv1 + /// if the token endpoint returns a 403 error indicating that IMDSv2 is not supported. + /// + /// This option has no effect if not using instance credentials + /// + /// [IMDSv2]: [https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html] + /// [SSRF attack]: [https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/] + /// + pub fn with_imdsv1_fallback(mut self) -> Self { + self.imdsv1_fallback = true; + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(self) -> Result { @@ -503,6 +521,7 @@ impl AmazonS3Builder { cache: Default::default(), client, retry_config: self.retry_config.clone(), + imdsv1_fallback: self.imdsv1_fallback, }) } }, diff --git a/src/client/mock_server.rs b/src/client/mock_server.rs new file mode 100644 index 0000000..adb7e0f --- /dev/null +++ b/src/client/mock_server.rs @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use hyper::service::{make_service_fn, service_fn}; +use hyper::{Body, Request, Response, Server}; +use parking_lot::Mutex; +use std::collections::VecDeque; +use std::convert::Infallible; +use std::net::SocketAddr; +use std::sync::Arc; +use tokio::sync::oneshot; +use tokio::task::JoinHandle; + +pub type ResponseFn = Box) -> Response + Send>; + +/// A mock server +pub struct MockServer { + responses: Arc>>, + shutdown: oneshot::Sender<()>, + handle: JoinHandle<()>, + url: String, +} + +impl MockServer { + pub fn new() -> Self { + let responses: Arc>> = + Arc::new(Mutex::new(VecDeque::with_capacity(10))); + + let r = Arc::clone(&responses); + let make_service = make_service_fn(move |_conn| { + let r = Arc::clone(&r); + async move { + Ok::<_, Infallible>(service_fn(move |req| { + let r = Arc::clone(&r); + async move { + Ok::<_, Infallible>(match r.lock().pop_front() { + Some(r) => r(req), + None => Response::new(Body::from("Hello World")), + }) + } + })) + } + }); + + let (shutdown, rx) = oneshot::channel::<()>(); + let server = + Server::bind(&SocketAddr::from(([127, 0, 0, 1], 0))).serve(make_service); + + let url = format!("http://{}", server.local_addr()); + + let handle = tokio::spawn(async move { + server + .with_graceful_shutdown(async { + rx.await.ok(); + }) + .await + .unwrap() + }); + + Self { + responses, + shutdown, + handle, + url, + } + } + + /// The url of the mock server + pub fn url(&self) -> &str { + &self.url + } + + /// Add a response + pub fn push(&self, response: Response) { + self.push_fn(|_| response) + } + + /// Add a response function + pub fn push_fn(&self, f: F) + where + F: FnOnce(Request) -> Response + Send + 'static, + { + self.responses.lock().push_back(Box::new(f)) + } + + /// Shutdown the mock server + pub async fn shutdown(self) { + let _ = self.shutdown.send(()); + self.handle.await.unwrap() + } +} diff --git a/src/client/mod.rs b/src/client/mod.rs index e6de3e9..c93c68a 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -18,6 +18,8 @@ //! 
Generic utilities reqwest based ObjectStore implementations pub mod backoff; +#[cfg(test)] +pub mod mock_server; pub mod pagination; pub mod retry; pub mod token; diff --git a/src/client/retry.rs b/src/client/retry.rs index 44d7835..d66628a 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -180,54 +180,17 @@ impl RetryExt for reqwest::RequestBuilder { #[cfg(test)] mod tests { + use crate::client::mock_server::MockServer; use crate::client::retry::RetryExt; use crate::RetryConfig; use hyper::header::LOCATION; - use hyper::service::{make_service_fn, service_fn}; - use hyper::{Body, Response, Server}; - use parking_lot::Mutex; + use hyper::{Body, Response}; use reqwest::{Client, Method, StatusCode}; - use std::collections::VecDeque; - use std::convert::Infallible; - use std::net::SocketAddr; - use std::sync::Arc; use std::time::Duration; #[tokio::test] async fn test_retry() { - let responses: Arc>>> = - Arc::new(Mutex::new(VecDeque::with_capacity(10))); - - let r = Arc::clone(&responses); - let make_service = make_service_fn(move |_conn| { - let r = Arc::clone(&r); - async move { - Ok::<_, Infallible>(service_fn(move |_req| { - let r = Arc::clone(&r); - async move { - Ok::<_, Infallible>(match r.lock().pop_front() { - Some(r) => r, - None => Response::new(Body::from("Hello World")), - }) - } - })) - } - }); - - let (tx, rx) = tokio::sync::oneshot::channel::<()>(); - let server = - Server::bind(&SocketAddr::from(([127, 0, 0, 1], 0))).serve(make_service); - - let url = format!("http://{}", server.local_addr()); - - let server_handle = tokio::spawn(async move { - server - .with_graceful_shutdown(async { - rx.await.ok(); - }) - .await - .unwrap() - }); + let mock = MockServer::new(); let retry = RetryConfig { backoff: Default::default(), @@ -236,14 +199,14 @@ mod tests { }; let client = Client::new(); - let do_request = || client.request(Method::GET, &url).send_retry(&retry); + let do_request = || client.request(Method::GET, mock.url()).send_retry(&retry); // Simple request should work let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::OK); // Returns client errors immediately with status message - responses.lock().push_back( + mock.push( Response::builder() .status(StatusCode::BAD_REQUEST) .body(Body::from("cupcakes")) @@ -256,7 +219,7 @@ mod tests { assert_eq!(&e.message, "cupcakes"); // Handles client errors with no payload - responses.lock().push_back( + mock.push( Response::builder() .status(StatusCode::BAD_REQUEST) .body(Body::empty()) @@ -269,7 +232,7 @@ mod tests { assert_eq!(&e.message, "No Body"); // Should retry server error request - responses.lock().push_back( + mock.push( Response::builder() .status(StatusCode::BAD_GATEWAY) .body(Body::empty()) @@ -280,7 +243,7 @@ mod tests { assert_eq!(r.status(), StatusCode::OK); // Accepts 204 status code - responses.lock().push_back( + mock.push( Response::builder() .status(StatusCode::NO_CONTENT) .body(Body::empty()) @@ -291,7 +254,7 @@ mod tests { assert_eq!(r.status(), StatusCode::NO_CONTENT); // Follows redirects - responses.lock().push_back( + mock.push( Response::builder() .status(StatusCode::FOUND) .header(LOCATION, "/foo") @@ -305,7 +268,7 @@ mod tests { // Gives up after the retrying the specified number of times for _ in 0..=retry.max_retries { - responses.lock().push_back( + mock.push( Response::builder() .status(StatusCode::BAD_GATEWAY) .body(Body::from("ignored")) @@ -318,7 +281,6 @@ mod tests { assert_eq!(e.message, "502 Bad Gateway"); // Shutdown - let _ = tx.send(()); - 
server_handle.await.unwrap(); + mock.shutdown().await } } From b9719e727aed17ccaf1e352287bb5c615ecc3668 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 31 Aug 2022 19:58:39 +0100 Subject: [PATCH 035/397] Move gcp.rs (#2619) --- src/{gcp.rs => gcp/mod.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{gcp.rs => gcp/mod.rs} (100%) diff --git a/src/gcp.rs b/src/gcp/mod.rs similarity index 100% rename from src/gcp.rs rename to src/gcp/mod.rs From 46d050685e166f70da01b560511fbf5d555c5c7c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 1 Sep 2022 17:07:21 +0100 Subject: [PATCH 036/397] Update quick-xml 0.24 (#2625) --- Cargo.toml | 2 +- src/azure/client.rs | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ccfe123..b0201e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.13", default-features = false, optional = true } -quick-xml = { version = "0.23.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.24.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/src/azure/client.rs b/src/azure/client.rs index 9f87a88..ece0785 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -630,9 +630,8 @@ mod tests { "; - let bytes = Bytes::from(S); let mut _list_blobs_response_internal: ListResultInternal = - quick_xml::de::from_slice(bytes.as_ref()).unwrap(); + quick_xml::de::from_str(S).unwrap(); } #[test] @@ -702,9 +701,8 @@ mod tests { "; - let bytes = Bytes::from(S); let mut _list_blobs_response_internal: ListResultInternal = - quick_xml::de::from_slice(bytes.as_ref()).unwrap(); + quick_xml::de::from_str(S).unwrap(); } #[test] From 1866ddaf817773a2d5c1e3e223a615cc92055523 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Sep 2022 12:34:22 +0100 Subject: [PATCH 037/397] Use http with fake-gcs (#2632) --- src/gcp/mod.rs | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 65adf91..0ef4d35 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -782,20 +782,6 @@ impl GoogleCloudStorageBuilder { self } - /// Use the specified http [`Client`] (defaults to [`Client::new`]) - /// - /// This allows you to set custom client options such as allowing - /// non secure connections or custom headers. - /// - /// NOTE: Currently only available in `test`s to facilitate - /// testing, to avoid leaking details and preserve our ability to - /// make changes to the implementation. 
- #[cfg(test)] - pub fn with_client(mut self, client: Client) -> Self { - self.client = Some(client); - self - } - /// Configure a connection to Google Cloud Storage, returning a /// new [`GoogleCloudStorage`] and consuming `self` pub fn build(self) -> Result { @@ -923,13 +909,6 @@ mod test { env::var("GOOGLE_SERVICE_ACCOUNT") .expect("already checked GOOGLE_SERVICE_ACCOUNT") ) - .with_client( - // ignore HTTPS errors in tests so we can use fake-gcs server - Client::builder() - .danger_accept_invalid_certs(true) - .build() - .expect("Error creating http client for testing") - ) } }}; } From ad60ecf41335949ff453c70dc9203094de66c836 Mon Sep 17 00:00:00 2001 From: JanKaul Date: Mon, 5 Sep 2022 18:52:19 +0200 Subject: [PATCH 038/397] update doc for object_store copy_if_not_exists (#2653) --- src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 9ed9db9..16f0c6f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -290,6 +290,10 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Copy an object from one path to another, only if destination is empty. /// /// Will return an error if the destination already has an object. + /// + /// Performs an atomic operation if the underlying object storage supports it. + /// If atomic operations are not supported by the underlying object storage (like S3) + /// it will return an error. async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()>; /// Move an object from one path to another in the same object store. From bd48558a968ca91cb7497d00051f580fcf83d509 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Thu, 8 Sep 2022 11:05:50 +0200 Subject: [PATCH 039/397] Fix multiple part uploads at once making vector size inconsistent (#2681) * Fix multiple part uploads at once making vector size inconsistent * Calculate total_parts prior to resizing the vector --- src/multipart.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multipart.rs b/src/multipart.rs index 1985d86..102d8be 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -94,9 +94,9 @@ where if self.tasks.is_empty() { return Ok(()); } - let total_parts = self.completed_parts.len(); while let Poll::Ready(Some(res)) = self.tasks.poll_next_unpin(cx) { let (part_idx, part) = res?; + let total_parts = self.completed_parts.len(); self.completed_parts .resize(std::cmp::max(part_idx + 1, total_parts), None); self.completed_parts[part_idx] = Some(part); From 622e56b5832d88b97f3af6bf5ac9633a60e289ef Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Sep 2022 12:20:03 +0100 Subject: [PATCH 040/397] Prepare object_store 0.5 release (#2682) * Prepare object_store 0.5 release * Review feedback --- .github_changelog_generator | 2 +- CHANGELOG-old.md | 71 ++++++++++++++++++++++++++++++++ CHANGELOG.md | 63 ++++++++++++---------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 5 files changed, 102 insertions(+), 40 deletions(-) create mode 100644 CHANGELOG-old.md diff --git a/.github_changelog_generator b/.github_changelog_generator index cbd8aa0..69b574a 100644 --- a/.github_changelog_generator +++ b/.github_changelog_generator @@ -23,5 +23,5 @@ add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":[" # so that the component is shown associated with the issue issue-line-labels=object-store # skip non object_store issues -exclude-labels=development-process,invalid,arrow,parquet,arrow-flight 
+exclude-labels=development-process,invalid,arrow,parquet,arrow-flight,parquet-derive breaking_labels=api-change diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md new file mode 100644 index 0000000..a6bda3c --- /dev/null +++ b/CHANGELOG-old.md @@ -0,0 +1,71 @@ + + +# Historical Changelog + +## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) + +**Implemented enhancements:** + +- Relax Path Validation to Allow Any Percent-Encoded Sequence [\#2355](https://github.com/apache/arrow-rs/issues/2355) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support get\_multi\_ranges in ObjectStore [\#2293](https://github.com/apache/arrow-rs/issues/2293) +- object\_store: Create explicit test for symlinks [\#2206](https://github.com/apache/arrow-rs/issues/2206) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Make builder style configuration for object stores [\#2203](https://github.com/apache/arrow-rs/issues/2203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add example in the main documentation readme [\#2202](https://github.com/apache/arrow-rs/issues/2202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Azure/S3 Storage Fails to Copy Blob with URL-encoded Path [\#2353](https://github.com/apache/arrow-rs/issues/2353) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Accessing a file with a percent-encoded name on the filesystem with ObjectStore LocalFileSystem [\#2349](https://github.com/apache/arrow-rs/issues/2349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Improve `object_store crate` documentation [\#2260](https://github.com/apache/arrow-rs/pull/2260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- Canonicalize filesystem paths in user-facing APIs \(\#2370\) [\#2371](https://github.com/apache/arrow-rs/pull/2371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store lint [\#2367](https://github.com/apache/arrow-rs/pull/2367) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relax path validation \(\#2355\) [\#2356](https://github.com/apache/arrow-rs/pull/2356) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix Copy from percent-encoded path \(\#2353\) [\#2354](https://github.com/apache/arrow-rs/pull/2354) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore::get\_ranges \(\#2293\) [\#2336](https://github.com/apache/arrow-rs/pull/2336) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove vestigal ` object_store/.circleci/` [\#2337](https://github.com/apache/arrow-rs/pull/2337) ([alamb](https://github.com/alamb)) +- Handle symlinks in LocalFileSystem \(\#2206\) [\#2269](https://github.com/apache/arrow-rs/pull/2269) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Retry GCP 
requests on server error [\#2243](https://github.com/apache/arrow-rs/pull/2243) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add LimitStore \(\#2175\) [\#2242](https://github.com/apache/arrow-rs/pull/2242) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Only trigger `arrow` CI on changes to arrow [\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) +- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) +- Add Builder style config objects for object\_store [\#2204](https://github.com/apache/arrow-rs/pull/2204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Ignore broken symlinks for LocalFileSystem object store [\#2195](https://github.com/apache/arrow-rs/pull/2195) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jccampagne](https://github.com/jccampagne)) +- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) +- Split most arrow specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) ([alamb](https://github.com/alamb)) +- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) +- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) +- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) +- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) +- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) +- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) +- Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* + diff --git a/CHANGELOG.md b/CHANGELOG.md index 93faa67..538eebf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,52 +19,43 @@ # Changelog -## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) +## [object_store_0.5.0](https://github.com/apache/arrow-rs/tree/object_store_0.5.0) (2022-09-08) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.4.0...object_store_0.5.0) + +**Breaking changes:** + +- Replace azure sdk with custom implementation [\#2509](https://github.com/apache/arrow-rs/pull/2509) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Replace rusoto with custom implementation for AWS \(\#2176\) [\#2352](https://github.com/apache/arrow-rs/pull/2352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Relax Path Validation to Allow Any Percent-Encoded Sequence [\#2355](https://github.com/apache/arrow-rs/issues/2355) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support get\_multi\_ranges in ObjectStore [\#2293](https://github.com/apache/arrow-rs/issues/2293) -- object\_store: Create explicit test for symlinks [\#2206](https://github.com/apache/arrow-rs/issues/2206) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Make builder style configuration for object stores [\#2203](https://github.com/apache/arrow-rs/issues/2203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Add example in the main documentation readme [\#2202](https://github.com/apache/arrow-rs/issues/2202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- IMDSv1 Fallback for S3 [\#2609](https://github.com/apache/arrow-rs/issues/2609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Print Response Body On Error [\#2572](https://github.com/apache/arrow-rs/issues/2572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Coalesce Ranges Parallel Fetch [\#2562](https://github.com/apache/arrow-rs/issues/2562) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Coalescing Out-of-Order Ranges [\#2561](https://github.com/apache/arrow-rs/issues/2561) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add TokenProvider authorization to azure [\#2373](https://github.com/apache/arrow-rs/issues/2373) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- AmazonS3Builder::from\_env to populate credentials from environment [\#2361](https://github.com/apache/arrow-rs/issues/2361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- AmazonS3 Support IMDSv2 [\#2350](https://github.com/apache/arrow-rs/issues/2350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- Azure/S3 Storage Fails to Copy Blob with URL-encoded Path [\#2353](https://github.com/apache/arrow-rs/issues/2353) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Accessing a file with a percent-encoded name on the filesystem with ObjectStore LocalFileSystem [\#2349](https://github.com/apache/arrow-rs/issues/2349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Retry Logic Fails to Retry Server Errors [\#2573](https://github.com/apache/arrow-rs/issues/2573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Fix multiple part uploads at once making vector size inconsistent [\#2681](https://github.com/apache/arrow-rs/pull/2681) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gruuya](https://github.com/gruuya)) +- Fix panic in `object_store::util::coalesce_ranges` [\#2554](https://github.com/apache/arrow-rs/pull/2554) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thinkharderdev](https://github.com/thinkharderdev)) -**Documentation updates:** +**Merged pull requests:** -- Improve `object_store crate` documentation [\#2260](https://github.com/apache/arrow-rs/pull/2260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- update doc for object\_store copy\_if\_not\_exists [\#2653](https://github.com/apache/arrow-rs/pull/2653) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([JanKaul](https://github.com/JanKaul)) +- Update quick-xml 0.24 [\#2625](https://github.com/apache/arrow-rs/pull/2625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add IMDSv1 fallback \(\#2609\) [\#2610](https://github.com/apache/arrow-rs/pull/2610) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ObjectStore cleanup \(\#2587\) [\#2590](https://github.com/apache/arrow-rs/pull/2590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix retry logic \(\#2573\) \(\#2572\) [\#2574](https://github.com/apache/arrow-rs/pull/2574) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Improve coalesce\_ranges \(\#2561\) \(\#2562\) [\#2563](https://github.com/apache/arrow-rs/pull/2563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update environment variable name for amazonS3builder in integration \(\#2550\) [\#2553](https://github.com/apache/arrow-rs/pull/2553) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) +- Build AmazonS3builder from environment variables \(\#2361\) [\#2536](https://github.com/apache/arrow-rs/pull/2536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) +- feat: add token provider authorization to azure store [\#2374](https://github.com/apache/arrow-rs/pull/2374) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -**Merged pull requests:** -- Canonicalize filesystem paths in user-facing APIs \(\#2370\) [\#2371](https://github.com/apache/arrow-rs/pull/2371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix object\_store lint [\#2367](https://github.com/apache/arrow-rs/pull/2367) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Relax path validation \(\#2355\) [\#2356](https://github.com/apache/arrow-rs/pull/2356) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix Copy from percent-encoded path \(\#2353\) [\#2354](https://github.com/apache/arrow-rs/pull/2354) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add ObjectStore::get\_ranges \(\#2293\) [\#2336](https://github.com/apache/arrow-rs/pull/2336) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Remove vestigal ` object_store/.circleci/` [\#2337](https://github.com/apache/arrow-rs/pull/2337) ([alamb](https://github.com/alamb)) -- Handle symlinks in LocalFileSystem \(\#2206\) [\#2269](https://github.com/apache/arrow-rs/pull/2269) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Retry GCP requests on server error [\#2243](https://github.com/apache/arrow-rs/pull/2243) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add LimitStore \(\#2175\) [\#2242](https://github.com/apache/arrow-rs/pull/2242) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Only trigger `arrow` CI on changes to arrow [\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) -- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) -- Add Builder style config objects for object\_store [\#2204](https://github.com/apache/arrow-rs/pull/2204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Ignore broken symlinks for LocalFileSystem object store [\#2195](https://github.com/apache/arrow-rs/pull/2195) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jccampagne](https://github.com/jccampagne)) -- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) -- Split most arrow specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) ([alamb](https://github.com/alamb)) -- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) -- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) -- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) -- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) -- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) -- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) -- Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) \* *This Changelog was 
automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index b0201e2..0f5b0fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.4.0" +version = "0.5.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index ebd50df..e737e04 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.3.0" -FUTURE_RELEASE="object_store_0.4.0" +SINCE_TAG="object_store_0.4.0" +FUTURE_RELEASE="object_store_0.5.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 89cf41114e67a2e507db3288e764888b11537e50 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 10 Sep 2022 18:55:46 +0100 Subject: [PATCH 041/397] Update quick-xml to 0.25 (#2695) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0f5b0fd..9e4e68d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.13", default-features = false, optional = true } -quick-xml = { version = "0.24.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.25.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From b90bd57f5e7db1eb57161d2f16aa6119e880f15c Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Thu, 15 Sep 2022 17:22:53 +0100 Subject: [PATCH 042/397] Fix multipart uploads on Minio (#2731) The official Minio SDK uses "uploads=" as the URL when it initiates a multipart upload instead of "uploads". This affects the AWSV4 signature and causes object_store to fail a signature check when initiating the upload to Minio. It's possible that this contradicts the AWS S3 API docs: https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateMultipartUpload.html#API_CreateMultipartUpload_RequestSyntax and we need to instead keep the URL as `?uploads` and change the URL that goes into the signature instead. 
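As a rough illustration (a standalone sketch, not the code in this patch), the snippet below shows why the raw query string and an AWS-style canonical query string can disagree for a bare `?uploads`: both spellings parse to the pair ("uploads", ""), which canonicalizes to "uploads=", so a signer that hashes the raw query "uploads" will not match a verifier that reconstructs "uploads=", which is the form Minio appears to sign against. The `canonical_query` helper and the `minio.example` host below are hypothetical, a simplified stand-in for real SigV4 canonicalization, and assume the `url` crate is available.

    use url::Url;

    // Simplified canonical query: sort key/value pairs and join as "k=v".
    // Real SigV4 also percent-encodes keys and values; omitted for brevity.
    fn canonical_query(url: &Url) -> String {
        let mut pairs: Vec<(String, String)> = url
            .query_pairs()
            .map(|(k, v)| (k.into_owned(), v.into_owned()))
            .collect();
        pairs.sort_unstable();
        pairs
            .iter()
            .map(|(k, v)| format!("{}={}", k, v))
            .collect::<Vec<_>>()
            .join("&")
    }

    fn main() {
        let bare = Url::parse("http://minio.example/bucket/key?uploads").unwrap();
        let with_eq = Url::parse("http://minio.example/bucket/key?uploads=").unwrap();

        // The raw query strings sent on the wire differ...
        assert_eq!(bare.query(), Some("uploads"));
        assert_eq!(with_eq.query(), Some("uploads="));

        // ...but both parse to ("uploads", "") and canonicalize identically,
        // so signing over the raw "uploads" cannot match a check built from
        // the canonical "uploads=".
        assert_eq!(canonical_query(&bare), "uploads=");
        assert_eq!(canonical_query(&with_eq), "uploads=");
    }
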
--- src/aws/client.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index d8ab3bb..f800fec 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -411,7 +411,7 @@ impl S3Client { pub async fn create_multipart(&self, location: &Path) -> Result { let credential = self.get_credential().await?; let url = format!( - "{}/{}/{}?uploads", + "{}/{}/{}?uploads=", self.config.endpoint, self.config.bucket, encode_path(location) From 8ad0a188e7474becaaaa58ea75450dd375be9ab1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 29 Sep 2022 07:33:31 +0100 Subject: [PATCH 043/397] Handle incomplete HTTP redirects missing LOCATION (#2795) (#2796) --- src/client/retry.rs | 119 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 92 insertions(+), 27 deletions(-) diff --git a/src/client/retry.rs b/src/client/retry.rs index d66628a..cee86b3 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -20,49 +20,62 @@ use crate::client::backoff::{Backoff, BackoffConfig}; use futures::future::BoxFuture; use futures::FutureExt; +use reqwest::header::LOCATION; use reqwest::{Response, StatusCode}; -use snafu::Snafu; use std::time::{Duration, Instant}; use tracing::info; /// Retry request error -#[derive(Debug, Snafu)] -#[snafu(display( - "response error \"{}\", after {} retries: {}", - message, - retries, - source -))] +#[derive(Debug)] pub struct Error { retries: usize, message: String, - source: reqwest::Error, + source: Option, +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "response error \"{}\", after {} retries", + self.message, self.retries + )?; + if let Some(source) = &self.source { + write!(f, ": {}", source)?; + } + Ok(()) + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.source.as_ref().map(|e| e as _) + } } impl Error { /// Returns the status code associated with this error if any pub fn status(&self) -> Option { - self.source.status() + self.source.as_ref().and_then(|e| e.status()) } } impl From for std::io::Error { fn from(err: Error) -> Self { use std::io::ErrorKind; - if err.source.is_builder() || err.source.is_request() { - Self::new(ErrorKind::InvalidInput, err) - } else if let Some(s) = err.source.status() { - match s { - StatusCode::NOT_FOUND => Self::new(ErrorKind::NotFound, err), - StatusCode::BAD_REQUEST => Self::new(ErrorKind::InvalidInput, err), - _ => Self::new(ErrorKind::Other, err), + match (&err.source, err.status()) { + (Some(source), _) if source.is_builder() || source.is_request() => { + Self::new(ErrorKind::InvalidInput, err) + } + (_, Some(StatusCode::NOT_FOUND)) => Self::new(ErrorKind::NotFound, err), + (_, Some(StatusCode::BAD_REQUEST)) => Self::new(ErrorKind::InvalidInput, err), + (Some(source), None) if source.is_timeout() => { + Self::new(ErrorKind::TimedOut, err) + } + (Some(source), None) if source.is_connect() => { + Self::new(ErrorKind::NotConnected, err) } - } else if err.source.is_timeout() { - Self::new(ErrorKind::TimedOut, err) - } else if err.source.is_connect() { - Self::new(ErrorKind::NotConnected, err) - } else { - Self::new(ErrorKind::Other, err) + _ => Self::new(ErrorKind::Other, err), } } } @@ -131,7 +144,21 @@ impl RetryExt for reqwest::RequestBuilder { let s = self.try_clone().expect("request body must be cloneable"); match s.send().await { Ok(r) => match r.error_for_status_ref() { - 
Ok(_) => return Ok(r), + Ok(_) if r.status().is_success() => return Ok(r), + Ok(r) => { + let is_bare_redirect = r.status().is_redirection() && !r.headers().contains_key(LOCATION); + let message = match is_bare_redirect { + true => "Received redirect without LOCATION, this normally indicates an incorrectly configured region".to_string(), + // Not actually sure if this is reachable, but here for completeness + false => format!("request unsuccessful: {}", r.status()), + }; + + return Err(Error{ + message, + retries, + source: None, + }) + } Err(e) => { let status = r.status(); @@ -152,7 +179,7 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ message, retries, - source: e, + source: Some(e), }) } @@ -168,7 +195,7 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ retries, message: "request error".to_string(), - source: e + source: Some(e) }) } } @@ -253,7 +280,7 @@ mod tests { let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::NO_CONTENT); - // Follows redirects + // Follows 402 redirects mock.push( Response::builder() .status(StatusCode::FOUND) @@ -266,6 +293,44 @@ mod tests { assert_eq!(r.status(), StatusCode::OK); assert_eq!(r.url().path(), "/foo"); + // Follows 401 redirects + mock.push( + Response::builder() + .status(StatusCode::FOUND) + .header(LOCATION, "/bar") + .body(Body::empty()) + .unwrap(), + ); + + let r = do_request().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + assert_eq!(r.url().path(), "/bar"); + + // Handles redirect loop + for _ in 0..10 { + mock.push( + Response::builder() + .status(StatusCode::FOUND) + .header(LOCATION, "/bar") + .body(Body::empty()) + .unwrap(), + ); + } + + let e = do_request().await.unwrap_err().to_string(); + assert!(e.ends_with("too many redirects"), "{}", e); + + // Handles redirect missing location + mock.push( + Response::builder() + .status(StatusCode::FOUND) + .body(Body::empty()) + .unwrap(), + ); + + let e = do_request().await.unwrap_err(); + assert_eq!(e.message, "Received redirect without LOCATION, this normally indicates an incorrectly configured region"); + // Gives up after the retrying the specified number of times for _ in 0..=retry.max_retries { mock.push( From f0293af024b2086d083486b0e86871d80b78beea Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 29 Sep 2022 15:46:03 +0100 Subject: [PATCH 044/397] Fix S3 query canonicalization (#2800) (#2801) * Fix S3 query canonicalization (#2800) * Disable listing with spaces on azurite and localstack --- src/aws/client.rs | 16 ++-------------- src/aws/credential.rs | 37 ++++++++++++++++++++++++++++++++++++- src/aws/mod.rs | 20 ++++++++++++++++++-- src/azure/mod.rs | 15 ++++++--------- src/lib.rs | 22 ++++++++++++++++++++++ src/path/mod.rs | 11 +++++++++++ 6 files changed, 95 insertions(+), 26 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index f800fec..5ec9390 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -16,6 +16,7 @@ // under the License. 
use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; +use crate::aws::STRICT_PATH_ENCODE_SET; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::multipart::UploadPart; @@ -26,26 +27,13 @@ use crate::{ }; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; -use percent_encoding::{utf8_percent_encode, AsciiSet, PercentEncode, NON_ALPHANUMERIC}; +use percent_encoding::{utf8_percent_encode, PercentEncode}; use reqwest::{Client as ReqwestClient, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::ops::Range; use std::sync::Arc; -// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html -// -// Do not URI-encode any of the unreserved characters that RFC 3986 defines: -// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). -const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC - .remove(b'-') - .remove(b'.') - .remove(b'_') - .remove(b'~'); - -/// This struct is used to maintain the URI path encoding -const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); - /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 1abf42b..d446164 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::aws::STRICT_ENCODE_SET; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::util::hmac_sha256; @@ -22,6 +23,7 @@ use crate::{Result, RetryConfig}; use bytes::Buf; use chrono::{DateTime, Utc}; use futures::TryFutureExt; +use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; @@ -29,6 +31,7 @@ use std::collections::BTreeMap; use std::sync::Arc; use std::time::Instant; use tracing::warn; +use url::Url; type StdError = Box; @@ -103,13 +106,14 @@ impl<'a> RequestSigner<'a> { request.headers_mut().insert(HASH_HEADER, header_digest); let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); + let canonical_query = canonicalize_query(request.url()); // https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html let canonical_request = format!( "{}\n{}\n{}\n{}\n{}\n{}", request.method().as_str(), request.url().path(), // S3 doesn't percent encode this like other services - request.url().query().unwrap_or(""), // This assumes the query pairs are in order + canonical_query, canonical_headers, signed_headers, digest @@ -207,6 +211,37 @@ fn hex_encode(bytes: &[u8]) -> String { out } +/// Canonicalizes query parameters into the AWS canonical form +/// +/// +fn canonicalize_query(url: &Url) -> String { + use std::fmt::Write; + + let capacity = match url.query() { + Some(q) if !q.is_empty() => q.len(), + _ => return String::new(), + }; + let mut encoded = String::with_capacity(capacity + 1); + + let mut headers = url.query_pairs().collect::>(); + headers.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + let mut first = true; + for (k, v) in headers { + if !first { + encoded.push('&'); + } + first = false; + let _ = write!( + encoded, + "{}={}", + utf8_percent_encode(k.as_ref(), &STRICT_ENCODE_SET), + utf8_percent_encode(v.as_ref(), &STRICT_ENCODE_SET) + ); + } + encoded +} + /// 
Canonicalizes headers into the AWS Canonical Form. /// /// diff --git a/src/aws/mod.rs b/src/aws/mod.rs index d1d0a12..d186c7f 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -58,6 +58,20 @@ use crate::{ mod client; mod credential; +// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html +// +// Do not URI-encode any of the unreserved characters that RFC 3986 defines: +// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). +pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = + percent_encoding::NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); + +/// This struct is used to maintain the URI path encoding +const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -551,7 +565,7 @@ mod tests { use super::*; use crate::tests::{ get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, + put_get_delete_list_opts, rename_and_copy, stream_get, }; use bytes::Bytes; use std::env; @@ -677,9 +691,11 @@ mod tests { #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); - put_get_delete_list(&integration).await; + // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 + put_get_delete_list_opts(&integration, is_local).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index dd1cde9..f7ca4cf 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -595,7 +595,7 @@ mod tests { use super::*; use crate::tests::{ copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, + put_get_delete_list, put_get_delete_list_opts, rename_and_copy, stream_get, }; use std::env; @@ -663,9 +663,10 @@ mod tests { #[tokio::test] async fn azure_blob_test() { + let use_emulator = env::var("AZURE_USE_EMULATOR").is_ok(); let integration = maybe_skip_integration!().build().unwrap(); - - put_get_delete_list(&integration).await; + // Azurite doesn't support listing with spaces - https://github.com/localstack/localstack/issues/6328 + put_get_delete_list_opts(&integration, use_emulator).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -687,13 +688,9 @@ mod tests { .with_container_name( env::var("OBJECT_STORE_BUCKET").expect("must be set OBJECT_STORE_BUCKET"), ) - .with_client_secret_authorization( - env::var("AZURE_STORAGE_CLIENT_ID") + .with_access_key( + env::var("AZURE_STORAGE_ACCESS_KEY") .expect("must be set AZURE_STORAGE_CLIENT_ID"), - env::var("AZURE_STORAGE_CLIENT_SECRET") - .expect("must be set AZURE_STORAGE_CLIENT_SECRET"), - env::var("AZURE_STORAGE_TENANT_ID") - .expect("must be set AZURE_STORAGE_TENANT_ID"), ); let integration = builder.build().unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 16f0c6f..5eaaaba 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -506,6 +506,13 @@ mod tests { use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { + put_get_delete_list_opts(storage, false).await + 
} + + pub(crate) async fn put_get_delete_list_opts( + storage: &DynObjectStore, + skip_list_with_spaces: bool, + ) { delete_fixtures(storage).await; let content_list = flatten_list_stream(storage, None).await.unwrap(); @@ -701,6 +708,21 @@ mod tests { assert_eq!(files, vec![path.clone()]); storage.delete(&path).await.unwrap(); + + let path = Path::parse("foo bar/I contain spaces.parquet").unwrap(); + storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + storage.head(&path).await.unwrap(); + + if !skip_list_with_spaces { + let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + } + storage.delete(&path).await.unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert!(files.is_empty(), "{:?}", files); } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { diff --git a/src/path/mod.rs b/src/path/mod.rs index e5a7b64..80e0f79 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -534,4 +534,15 @@ mod tests { needle ); } + + #[test] + fn path_containing_spaces() { + let a = Path::from_iter(["foo bar", "baz"]); + let b = Path::from("foo bar/baz"); + let c = Path::parse("foo bar/baz").unwrap(); + + assert_eq!(a.raw, "foo bar/baz"); + assert_eq!(a.raw, b.raw); + assert_eq!(b.raw, c.raw); + } } From 474c00598711a287cc1340dcae5b0572f59610c5 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sat, 1 Oct 2022 00:17:22 -0700 Subject: [PATCH 045/397] Support for overriding instance metadata endpoint (#2811) * Support for setting instance metadata endpoint * Actually implement * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/aws/credential.rs | 4 ++-- src/aws/mod.rs | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index d446164..ada855b 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -321,17 +321,17 @@ pub struct InstanceCredentialProvider { pub client: Client, pub retry_config: RetryConfig, pub imdsv1_fallback: bool, + pub metadata_endpoint: String, } impl InstanceCredentialProvider { async fn get_credential(&self) -> Result> { self.cache .get_or_insert_with(|| { - const METADATA_ENDPOINT: &str = "http://169.254.169.254"; instance_creds( &self.client, &self.retry_config, - METADATA_ENDPOINT, + &self.metadata_endpoint, self.imdsv1_fallback, ) .map_err(|source| crate::Error::Generic { diff --git a/src/aws/mod.rs b/src/aws/mod.rs index d186c7f..a602603 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -72,6 +72,9 @@ pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = /// This struct is used to maintain the URI path encoding const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); +/// Default metadata endpoint +static METADATA_ENDPOINT: &str = "http://169.254.169.254"; + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -354,6 +357,7 @@ pub struct AmazonS3Builder { retry_config: RetryConfig, allow_http: bool, imdsv1_fallback: bool, + metadata_endpoint: Option, } impl AmazonS3Builder { @@ -370,6 +374,7 @@ impl AmazonS3Builder { /// * AWS_DEFAULT_REGION -> region /// * AWS_ENDPOINT -> endpoint /// * AWS_SESSION_TOKEN -> token + /// * AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> /// # Example /// ``` /// 
use object_store::aws::AmazonS3Builder; @@ -401,6 +406,15 @@ impl AmazonS3Builder { builder.token = Some(token); } + // This env var is set in ECS + // https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html + if let Ok(metadata_relative_uri) = + std::env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") + { + builder.metadata_endpoint = + Some(format!("{}{}", METADATA_ENDPOINT, metadata_relative_uri)); + } + builder } @@ -478,6 +492,16 @@ impl AmazonS3Builder { self } + /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), + /// used primarily within AWS EC2. + /// + /// This defaults to the IPv4 endpoint: http://169.254.169.254. One can alternatively use the IPv6 + /// endpoint http://fd00:ec2::254. + pub fn with_metadata_endpoint(mut self, endpoint: impl Into) -> Self { + self.metadata_endpoint = Some(endpoint.into()); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(self) -> Result { @@ -536,6 +560,9 @@ impl AmazonS3Builder { client, retry_config: self.retry_config.clone(), imdsv1_fallback: self.imdsv1_fallback, + metadata_endpoint: self + .metadata_endpoint + .unwrap_or_else(|| METADATA_ENDPOINT.into()), }) } }, @@ -667,6 +694,10 @@ mod tests { let aws_session_token = env::var("AWS_SESSION_TOKEN") .unwrap_or_else(|_| "object_store:fake_session_token".into()); + let container_creds_relative_uri = + env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") + .unwrap_or_else(|_| "/object_store/fake_credentials_uri".into()); + // required env::set_var("AWS_ACCESS_KEY_ID", &aws_access_key_id); env::set_var("AWS_SECRET_ACCESS_KEY", &aws_secret_access_key); @@ -675,6 +706,10 @@ mod tests { // optional env::set_var("AWS_ENDPOINT", &aws_endpoint); env::set_var("AWS_SESSION_TOKEN", &aws_session_token); + env::set_var( + "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI", + &container_creds_relative_uri, + ); let builder = AmazonS3Builder::from_env(); assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); @@ -686,6 +721,10 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + + let metadata_uri = + format!("{}{}", METADATA_ENDPOINT, container_creds_relative_uri); + assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); } #[tokio::test] From 900b0613cf612ff74b7faef25043078e7d6ca29a Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Sat, 1 Oct 2022 11:09:00 -0400 Subject: [PATCH 046/397] Handle S3 virtual host request type (#2782) * include s2 virtual host request type * formatting changes * fix issues highlighted in PR comments * initialize bucket_endpoint * some imporments on endpoint initialization * fix issue in initalizing bucket_endpoint * incorporating PR comments * incorporate PR comments * fix typo in comment Co-authored-by: askoa --- src/aws/client.rs | 12 ++++------- src/aws/mod.rs | 54 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 5ec9390..2962162 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -197,6 +197,7 @@ pub struct S3Config { pub region: String, pub endpoint: String, pub bucket: String, + pub bucket_endpoint: String, pub credentials: CredentialProvider, pub retry_config: RetryConfig, pub allow_http: bool, @@ -204,7 +205,7 @@ pub struct S3Config { impl S3Config { fn path_url(&self, path: &Path) -> String 
{ - format!("{}/{}/{}", self.endpoint, self.bucket, encode_path(path)) + format!("{}/{}", self.bucket_endpoint, encode_path(path)) } } @@ -342,7 +343,7 @@ impl S3Client { token: Option<&str>, ) -> Result<(ListResult, Option)> { let credential = self.get_credential().await?; - let url = format!("{}/{}", self.config.endpoint, self.config.bucket); + let url = self.config.bucket_endpoint.clone(); let mut query = Vec::with_capacity(4); @@ -398,12 +399,7 @@ impl S3Client { pub async fn create_multipart(&self, location: &Path) -> Result { let credential = self.get_credential().await?; - let url = format!( - "{}/{}/{}?uploads=", - self.config.endpoint, - self.config.bucket, - encode_path(location) - ); + let url = format!("{}?uploads=", self.config.path_url(location),); let response = self .client diff --git a/src/aws/mod.rs b/src/aws/mod.rs index a602603..e3510b3 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -357,6 +357,7 @@ pub struct AmazonS3Builder { retry_config: RetryConfig, allow_http: bool, imdsv1_fallback: bool, + virtual_hosted_style_request: bool, metadata_endpoint: Option, } @@ -446,10 +447,13 @@ impl AmazonS3Builder { } /// Sets the endpoint for communicating with AWS S3. Default value - /// is based on region. + /// is based on region. The `endpoint` field should be consistent with + /// the field `virtual_hosted_style_request'. /// /// For example, this might be set to `"http://localhost:4566:` /// for testing against a localstack instance. + /// If `virtual_hosted_style_request` is set to true then `endpoint` + /// should have bucket name included. pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { self.endpoint = Some(endpoint.into()); self @@ -469,6 +473,23 @@ impl AmazonS3Builder { self } + /// Sets if virtual hosted style request has to be used. + /// If `virtual_hosted_style_request` is : + /// * false (default): Path style request is used + /// * true: Virtual hosted style request is used + /// + /// If the `endpoint` is provided then it should be + /// consistent with `virtual_hosted_style_request`. + /// i.e. if `virtual_hosted_style_request` is set to true + /// then `endpoint` should have bucket name included. + pub fn with_virtual_hosted_style_request( + mut self, + virtual_hosted_style_request: bool, + ) -> Self { + self.virtual_hosted_style_request = virtual_hosted_style_request; + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -568,14 +589,29 @@ impl AmazonS3Builder { }, }; - let endpoint = self - .endpoint - .unwrap_or_else(|| format!("https://s3.{}.amazonaws.com", region)); + let endpoint: String; + let bucket_endpoint: String; + + //If `endpoint` is provided then its assumed to be consistent with + // `virutal_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then + // `endpoint` should have bucket name included. 
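As context for the branch that follows, the two addressing styles produce different URLs for the same object. A minimal sketch with placeholder bucket and region values (not part of the patch):

```rust
/// Returns (endpoint, bucket_endpoint) for a hypothetical bucket and region.
fn endpoints(bucket: &str, region: &str, virtual_hosted: bool) -> (String, String) {
    if virtual_hosted {
        // The bucket is part of the host, so the bucket endpoint is the endpoint itself
        let endpoint = format!("https://{}.s3.{}.amazonaws.com", bucket, region);
        (endpoint.clone(), endpoint)
    } else {
        // Path style appends the bucket to the regional endpoint
        let endpoint = format!("https://s3.{}.amazonaws.com", region);
        let bucket_endpoint = format!("{}/{}", endpoint, bucket);
        (endpoint, bucket_endpoint)
    }
}

fn main() {
    assert_eq!(
        endpoints("data", "us-east-1", false).1,
        "https://s3.us-east-1.amazonaws.com/data"
    );
    assert_eq!(
        endpoints("data", "us-east-1", true).1,
        "https://data.s3.us-east-1.amazonaws.com"
    );
}
```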
+ if self.virtual_hosted_style_request { + endpoint = self.endpoint.unwrap_or_else(|| { + format!("https://{}.s3.{}.amazonaws.com", bucket, region) + }); + bucket_endpoint = endpoint.clone(); + } else { + endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://s3.{}.amazonaws.com", region)); + bucket_endpoint = format!("{}/{}", endpoint, bucket); + } let config = S3Config { region, endpoint, bucket, + bucket_endpoint, credentials, retry_config: self.retry_config, allow_http: self.allow_http, @@ -674,6 +710,16 @@ mod tests { config }; + let config = if let Some(virtual_hosted_style_request) = + env::var("OBJECT_STORE_VIRTUAL_HOSTED_STYLE_REQUEST").ok() + { + config.with_virtual_hosted_style_request( + virtual_hosted_style_request.trim().parse().unwrap(), + ) + } else { + config + }; + config } }}; From c52216e5aca7cb56a1f35738d1014e7384295908 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 3 Oct 2022 05:52:38 -0700 Subject: [PATCH 047/397] Allow Configuring non-TLS HTTP Connections in AmazonS3Builder::from_env (#2807) * Allow HTTP * Update docs --- src/aws/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index e3510b3..c08a635 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -376,6 +376,7 @@ impl AmazonS3Builder { /// * AWS_ENDPOINT -> endpoint /// * AWS_SESSION_TOKEN -> token /// * AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> + /// * AWS_ALLOW_HTTP -> set to "true" to permit HTTP connections without TLS /// # Example /// ``` /// use object_store::aws::AmazonS3Builder; @@ -416,6 +417,10 @@ impl AmazonS3Builder { Some(format!("{}{}", METADATA_ENDPOINT, metadata_relative_uri)); } + if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { + builder.allow_http = text == "true"; + } + builder } From 226ecc8b117d93539f0fdd70efab559ff30e7f6d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 4 Oct 2022 18:52:54 +0100 Subject: [PATCH 048/397] Prepare object_store 0.5.1 (#2824) --- CHANGELOG-old.md | 37 +++++++++++++++++++++++++++++ CHANGELOG.md | 40 +++++++++++--------------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 ++-- 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index a6bda3c..bf1ef62 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,43 @@ # Historical Changelog +## [object_store_0.5.0](https://github.com/apache/arrow-rs/tree/object_store_0.5.0) (2022-09-08) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.4.0...object_store_0.5.0) + +**Breaking changes:** + +- Replace azure sdk with custom implementation [\#2509](https://github.com/apache/arrow-rs/pull/2509) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Replace rusoto with custom implementation for AWS \(\#2176\) [\#2352](https://github.com/apache/arrow-rs/pull/2352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- IMDSv1 Fallback for S3 [\#2609](https://github.com/apache/arrow-rs/issues/2609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Print Response Body On Error [\#2572](https://github.com/apache/arrow-rs/issues/2572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Coalesce Ranges Parallel Fetch [\#2562](https://github.com/apache/arrow-rs/issues/2562) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Coalescing Out-of-Order Ranges [\#2561](https://github.com/apache/arrow-rs/issues/2561) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add TokenProvider authorization to azure [\#2373](https://github.com/apache/arrow-rs/issues/2373) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- AmazonS3Builder::from\_env to populate credentials from environment [\#2361](https://github.com/apache/arrow-rs/issues/2361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- AmazonS3 Support IMDSv2 [\#2350](https://github.com/apache/arrow-rs/issues/2350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Retry Logic Fails to Retry Server Errors [\#2573](https://github.com/apache/arrow-rs/issues/2573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Fix multiple part uploads at once making vector size inconsistent [\#2681](https://github.com/apache/arrow-rs/pull/2681) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gruuya](https://github.com/gruuya)) +- Fix panic in `object_store::util::coalesce_ranges` [\#2554](https://github.com/apache/arrow-rs/pull/2554) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thinkharderdev](https://github.com/thinkharderdev)) + +**Merged pull requests:** + +- update doc for object\_store copy\_if\_not\_exists [\#2653](https://github.com/apache/arrow-rs/pull/2653) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([JanKaul](https://github.com/JanKaul)) +- Update quick-xml 0.24 [\#2625](https://github.com/apache/arrow-rs/pull/2625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add IMDSv1 fallback \(\#2609\) [\#2610](https://github.com/apache/arrow-rs/pull/2610) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ObjectStore cleanup \(\#2587\) [\#2590](https://github.com/apache/arrow-rs/pull/2590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix retry logic \(\#2573\) \(\#2572\) [\#2574](https://github.com/apache/arrow-rs/pull/2574) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Improve coalesce\_ranges \(\#2561\) \(\#2562\) [\#2563](https://github.com/apache/arrow-rs/pull/2563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update environment variable name for amazonS3builder in integration \(\#2550\) [\#2553](https://github.com/apache/arrow-rs/pull/2553) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) +- Build AmazonS3builder from environment variables \(\#2361\) [\#2536](https://github.com/apache/arrow-rs/pull/2536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) +- feat: add token provider authorization to azure store [\#2374](https://github.com/apache/arrow-rs/pull/2374) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) + ## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) [Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 538eebf..6919111 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,42 +19,30 @@ # Changelog -## [object_store_0.5.0](https://github.com/apache/arrow-rs/tree/object_store_0.5.0) (2022-09-08) +## [object_store_0.5.1](https://github.com/apache/arrow-rs/tree/object_store_0.5.1) (2022-10-04) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.4.0...object_store_0.5.0) - -**Breaking changes:** - -- Replace azure sdk with custom implementation [\#2509](https://github.com/apache/arrow-rs/pull/2509) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Replace rusoto with custom implementation for AWS \(\#2176\) [\#2352](https://github.com/apache/arrow-rs/pull/2352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) **Implemented enhancements:** -- IMDSv1 Fallback for S3 [\#2609](https://github.com/apache/arrow-rs/issues/2609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Print Response Body On Error [\#2572](https://github.com/apache/arrow-rs/issues/2572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Coalesce Ranges Parallel Fetch [\#2562](https://github.com/apache/arrow-rs/issues/2562) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support Coalescing Out-of-Order Ranges [\#2561](https://github.com/apache/arrow-rs/issues/2561) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Add TokenProvider authorization to azure [\#2373](https://github.com/apache/arrow-rs/issues/2373) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- AmazonS3Builder::from\_env to populate credentials from environment [\#2361](https://github.com/apache/arrow-rs/issues/2361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- AmazonS3 Support IMDSv2 [\#2350](https://github.com/apache/arrow-rs/issues/2350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Allow HTTP S3 URLs [\#2806](https://github.com/apache/arrow-rs/issues/2806) +- object\_store: support AWS ECS instance credentials [\#2802](https://github.com/apache/arrow-rs/issues/2802) +- Object Store S3 Alibaba Cloud OSS support [\#2777](https://github.com/apache/arrow-rs/issues/2777) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Expose option to use GCS object store in integration tests [\#2627](https://github.com/apache/arrow-rs/issues/2627) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- Retry Logic Fails to Retry Server Errors [\#2573](https://github.com/apache/arrow-rs/issues/2573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Fix multiple part uploads at once making vector size inconsistent [\#2681](https://github.com/apache/arrow-rs/pull/2681) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gruuya](https://github.com/gruuya)) -- Fix panic in `object_store::util::coalesce_ranges` [\#2554](https://github.com/apache/arrow-rs/pull/2554) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([thinkharderdev](https://github.com/thinkharderdev)) +- S3 Signature Error Performing List With Prefix Containing Spaces [\#2800](https://github.com/apache/arrow-rs/issues/2800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Erratic Behaviour if Incorrect S3 Region Configured [\#2795](https://github.com/apache/arrow-rs/issues/2795) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- update doc for object\_store copy\_if\_not\_exists [\#2653](https://github.com/apache/arrow-rs/pull/2653) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([JanKaul](https://github.com/JanKaul)) -- Update quick-xml 0.24 [\#2625](https://github.com/apache/arrow-rs/pull/2625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add IMDSv1 fallback \(\#2609\) [\#2610](https://github.com/apache/arrow-rs/pull/2610) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- ObjectStore cleanup \(\#2587\) [\#2590](https://github.com/apache/arrow-rs/pull/2590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix retry logic \(\#2573\) \(\#2572\) [\#2574](https://github.com/apache/arrow-rs/pull/2574) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Improve coalesce\_ranges \(\#2561\) \(\#2562\) [\#2563](https://github.com/apache/arrow-rs/pull/2563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update environment variable name for amazonS3builder in integration \(\#2550\) [\#2553](https://github.com/apache/arrow-rs/pull/2553) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) -- Build AmazonS3builder from environment variables \(\#2361\) [\#2536](https://github.com/apache/arrow-rs/pull/2536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) -- feat: add token provider authorization to azure store [\#2374](https://github.com/apache/arrow-rs/pull/2374) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Support for overriding instance metadata endpoint [\#2811](https://github.com/apache/arrow-rs/pull/2811) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Allow Configuring non-TLS HTTP Connections in AmazonS3Builder::from\_env [\#2807](https://github.com/apache/arrow-rs/pull/2807) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Fix S3 query canonicalization \(\#2800\) [\#2801](https://github.com/apache/arrow-rs/pull/2801) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Handle incomplete HTTP redirects missing LOCATION \(\#2795\) [\#2796](https://github.com/apache/arrow-rs/pull/2796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Handle S3 virtual host request type [\#2782](https://github.com/apache/arrow-rs/pull/2782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) 
+- Fix object\_store multipart uploads on S3 Compatible Stores [\#2731](https://github.com/apache/arrow-rs/pull/2731) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mildbyte](https://github.com/mildbyte)) diff --git a/Cargo.toml b/Cargo.toml index 9e4e68d..6abb390 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.0" +version = "0.5.1" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index e737e04..865acde 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.4.0" -FUTURE_RELEASE="object_store_0.5.0" +SINCE_TAG="object_store_0.5.0" +FUTURE_RELEASE="object_store_0.5.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 32c9da7095d265aa6bf1fe3c4d943777246ad170 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 25 Oct 2022 07:25:10 +1300 Subject: [PATCH 049/397] Update quick-xml requirement from 0.25.0 to 0.26.0 (#2918) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.25.0...v0.26.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6abb390..e521373 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.13", default-features = false, optional = true } -quick-xml = { version = "0.25.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.26.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From e5de7ce3e565f8ccc795ece54ac5599a796ca7ab Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 26 Oct 2022 09:04:33 +1300 Subject: [PATCH 050/397] Add experimental AWS_PROFILE support (#2178) (#2891) * Add experimental AWS_PROFILE support (#2178) * Add docs * Include region --- Cargo.toml | 7 ++ src/aws/client.rs | 2 +- src/aws/credential.rs | 152 +++++++++++++++++++++++++++++------------- src/aws/mod.rs | 103 ++++++++++++++++++++-------- 4 files changed, 189 insertions(+), 75 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e521373..fc2af7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,12 +52,19 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls" ring = { version = "0.16", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } +# AWS Profile support +aws-types = { version = "0.49", optional = true } +aws-config = { version = "0.49", optional = 
true } + [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] +# Experimental support for AWS_PROFILE +aws_profile = ["aws", "aws-config", "aws-types"] + [dev-dependencies] # In alphabetical order dotenv = "0.15.0" tempfile = "3.1.0" diff --git a/src/aws/client.rs b/src/aws/client.rs index 2962162..a07cdb3 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -198,7 +198,7 @@ pub struct S3Config { pub endpoint: String, pub bucket: String, pub bucket_endpoint: String, - pub credentials: CredentialProvider, + pub credentials: Box, pub retry_config: RetryConfig, pub allow_http: bool, } diff --git a/src/aws/credential.rs b/src/aws/credential.rs index ada855b..32430d7 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -22,6 +22,7 @@ use crate::util::hmac_sha256; use crate::{Result, RetryConfig}; use bytes::Buf; use chrono::{DateTime, Utc}; +use futures::future::BoxFuture; use futures::TryFutureExt; use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue}; @@ -289,21 +290,8 @@ fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { } /// Provides credentials for use when signing requests -#[derive(Debug)] -pub enum CredentialProvider { - Static(StaticCredentialProvider), - Instance(InstanceCredentialProvider), - WebIdentity(WebIdentityProvider), -} - -impl CredentialProvider { - pub async fn get_credential(&self) -> Result> { - match self { - Self::Static(s) => Ok(Arc::clone(&s.credential)), - Self::Instance(c) => c.get_credential().await, - Self::WebIdentity(c) => c.get_credential().await, - } - } +pub trait CredentialProvider: std::fmt::Debug + Send + Sync { + fn get_credential(&self) -> BoxFuture<'_, Result>>; } /// A static set of credentials @@ -312,6 +300,12 @@ pub struct StaticCredentialProvider { pub credential: Arc, } +impl CredentialProvider for StaticCredentialProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(futures::future::ready(Ok(Arc::clone(&self.credential)))) + } +} + /// Credentials sourced from the instance metadata service /// /// @@ -324,22 +318,20 @@ pub struct InstanceCredentialProvider { pub metadata_endpoint: String, } -impl InstanceCredentialProvider { - async fn get_credential(&self) -> Result> { - self.cache - .get_or_insert_with(|| { - instance_creds( - &self.client, - &self.retry_config, - &self.metadata_endpoint, - self.imdsv1_fallback, - ) - .map_err(|source| crate::Error::Generic { - store: "S3", - source, - }) +impl CredentialProvider for InstanceCredentialProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(self.cache.get_or_insert_with(|| { + instance_creds( + &self.client, + &self.retry_config, + &self.metadata_endpoint, + self.imdsv1_fallback, + ) + .map_err(|source| crate::Error::Generic { + store: "S3", + source, }) - .await + })) } } @@ -357,24 +349,22 @@ pub struct WebIdentityProvider { pub retry_config: RetryConfig, } -impl WebIdentityProvider { - async fn get_credential(&self) -> Result> { - self.cache - .get_or_insert_with(|| { - web_identity( - &self.client, - &self.retry_config, - &self.token, - &self.role_arn, - &self.session_name, - &self.endpoint, - ) - .map_err(|source| crate::Error::Generic { - store: "S3", - source, - }) +impl CredentialProvider for WebIdentityProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(self.cache.get_or_insert_with(|| { + 
web_identity( + &self.client, + &self.retry_config, + &self.token, + &self.role_arn, + &self.session_name, + &self.endpoint, + ) + .map_err(|source| crate::Error::Generic { + store: "S3", + source, }) - .await + })) } } @@ -520,6 +510,74 @@ async fn web_identity( }) } +#[cfg(feature = "aws_profile")] +mod profile { + use super::*; + use aws_config::profile::ProfileFileCredentialsProvider; + use aws_config::provider_config::ProviderConfig; + use aws_types::credentials::ProvideCredentials; + use aws_types::region::Region; + use std::time::SystemTime; + + #[derive(Debug)] + pub struct ProfileProvider { + cache: TokenCache>, + credentials: ProfileFileCredentialsProvider, + } + + impl ProfileProvider { + pub fn new(name: String, region: String) -> Self { + let config = ProviderConfig::default().with_region(Some(Region::new(region))); + + Self { + cache: Default::default(), + credentials: ProfileFileCredentialsProvider::builder() + .configure(&config) + .profile_name(name) + .build(), + } + } + } + + impl CredentialProvider for ProfileProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(self.cache.get_or_insert_with(move || async move { + let c = + self.credentials + .provide_credentials() + .await + .map_err(|source| crate::Error::Generic { + store: "S3", + source: Box::new(source), + })?; + + let t_now = SystemTime::now(); + let expiry = match c.expiry().and_then(|e| e.duration_since(t_now).ok()) { + Some(ttl) => Instant::now() + ttl, + None => { + return Err(crate::Error::Generic { + store: "S3", + source: "Invalid expiry".into(), + }) + } + }; + + Ok(TemporaryToken { + token: Arc::new(AwsCredential { + key_id: c.access_key_id().to_string(), + secret_key: c.secret_access_key().to_string(), + token: c.session_token().map(ToString::to_string), + }), + expiry, + }) + })) + } + } +} + +#[cfg(feature = "aws_profile")] +pub use profile::ProfileProvider; + #[cfg(test)] mod tests { use super::*; diff --git a/src/aws/mod.rs b/src/aws/mod.rs index c08a635..4a81065 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -109,6 +109,9 @@ enum Error { #[snafu(display("Missing SecretAccessKey"))] MissingSecretAccessKey, + #[snafu(display("Profile support requires aws_profile feature"))] + MissingProfileFeature, + #[snafu(display("ETag Header missing from response"))] MissingEtag, @@ -359,6 +362,7 @@ pub struct AmazonS3Builder { imdsv1_fallback: bool, virtual_hosted_style_request: bool, metadata_endpoint: Option, + profile: Option, } impl AmazonS3Builder { @@ -370,13 +374,14 @@ impl AmazonS3Builder { /// Fill the [`AmazonS3Builder`] with regular AWS environment variables /// /// Variables extracted from environment: - /// * AWS_ACCESS_KEY_ID -> access_key_id - /// * AWS_SECRET_ACCESS_KEY -> secret_access_key - /// * AWS_DEFAULT_REGION -> region - /// * AWS_ENDPOINT -> endpoint - /// * AWS_SESSION_TOKEN -> token - /// * AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> - /// * AWS_ALLOW_HTTP -> set to "true" to permit HTTP connections without TLS + /// * `AWS_ACCESS_KEY_ID` -> access_key_id + /// * `AWS_SECRET_ACCESS_KEY` -> secret_access_key + /// * `AWS_DEFAULT_REGION` -> region + /// * `AWS_ENDPOINT` -> endpoint + /// * `AWS_SESSION_TOKEN` -> token + /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> + /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS + /// * `AWS_PROFILE` -> set profile name, requires `aws_profile` feature enabled /// # Example /// ``` /// use object_store::aws::AmazonS3Builder; @@ -408,6 +413,10 @@ impl AmazonS3Builder { builder.token = 
Some(token); } + if let Ok(profile) = std::env::var("AWS_PROFILE") { + builder.profile = Some(profile); + } + // This env var is set in ECS // https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html if let Ok(metadata_relative_uri) = @@ -528,6 +537,24 @@ impl AmazonS3Builder { self } + /// Set the AWS profile name, see + /// + /// This makes use of [aws-config] to provide credentials and therefore requires + /// the `aws-profile` feature to be enabled + /// + /// It is strongly encouraged that users instead make use of a credential manager + /// such as [aws-vault] not only to avoid the significant additional dependencies, + /// but also to avoid storing credentials in [plain text on disk] + /// + /// [aws-config]: https://docs.rs/aws-config + /// [aws-vault]: https://github.com/99designs/aws-vault + /// [plain text on disk]: https://99designs.com.au/blog/engineering/aws-vault/ + #[cfg(feature = "aws_profile")] + pub fn with_profile(mut self, profile: impl Into) -> Self { + self.profile = Some(profile.into()); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(self) -> Result { @@ -537,13 +564,13 @@ impl AmazonS3Builder { let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { info!("Using Static credential provider"); - CredentialProvider::Static(StaticCredentialProvider { + Box::new(StaticCredentialProvider { credential: Arc::new(AwsCredential { key_id, secret_key, token, }), - }) + }) as _ } (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), @@ -565,7 +592,7 @@ impl AmazonS3Builder { // Disallow non-HTTPs requests let client = Client::builder().https_only(true).build().unwrap(); - CredentialProvider::WebIdentity(WebIdentityProvider { + Box::new(WebIdentityProvider { cache: Default::default(), token, session_name, @@ -573,24 +600,30 @@ impl AmazonS3Builder { endpoint, client, retry_config: self.retry_config.clone(), - }) - } - _ => { - info!("Using Instance credential provider"); - - // The instance metadata endpoint is access over HTTP - let client = Client::builder().https_only(false).build().unwrap(); - - CredentialProvider::Instance(InstanceCredentialProvider { - cache: Default::default(), - client, - retry_config: self.retry_config.clone(), - imdsv1_fallback: self.imdsv1_fallback, - metadata_endpoint: self - .metadata_endpoint - .unwrap_or_else(|| METADATA_ENDPOINT.into()), - }) + }) as _ } + _ => match self.profile { + Some(profile) => { + info!("Using profile \"{}\" credential provider", profile); + profile_credentials(profile, region.clone())? 
+ } + None => { + info!("Using Instance credential provider"); + + // The instance metadata endpoint is access over HTTP + let client = Client::builder().https_only(false).build().unwrap(); + + Box::new(InstanceCredentialProvider { + cache: Default::default(), + client, + retry_config: self.retry_config.clone(), + imdsv1_fallback: self.imdsv1_fallback, + metadata_endpoint: self + .metadata_endpoint + .unwrap_or_else(|| METADATA_ENDPOINT.into()), + }) as _ + } + }, }, }; @@ -628,6 +661,22 @@ impl AmazonS3Builder { } } +#[cfg(feature = "aws_profile")] +fn profile_credentials( + profile: String, + region: String, +) -> Result> { + Ok(Box::new(credential::ProfileProvider::new(profile, region))) +} + +#[cfg(not(feature = "aws_profile"))] +fn profile_credentials( + _profile: String, + _region: String, +) -> Result> { + Err(Error::MissingProfileFeature.into()) +} + #[cfg(test)] mod tests { use super::*; From e2e7dbad629baa7da3b45ce8836c3a8433575124 Mon Sep 17 00:00:00 2001 From: John Hughes Date: Wed, 26 Oct 2022 05:34:18 +0200 Subject: [PATCH 051/397] Support building `object_store` and `parquet` on wasm32-unknown-unknown target (#2896) * Support building object_store on wasm32-unknown-unknown target * Added cargo check step to parquet workflow for wasm32-unknown-unknown * Added compile-time warning for unsupported cloud features when compiling with wasm32 * Added cargo check features to the parquet github workflow. * Added a section to the README.md for parquet * * Added wasm32-unknown-unknown section to the object_store README.md --- Cargo.toml | 6 ++++-- README.md | 9 ++++++++- src/lib.rs | 20 +++++++++++++++++--- src/path/mod.rs | 6 ++++++ src/util.rs | 1 + 5 files changed, 36 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fc2af7e..f5eb111 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,7 +37,7 @@ itertools = "0.10.1" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" -tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } +tokio = { version = "1.18", features = ["sync", "macros", "rt", "time", "io-util"] } tracing = { version = "0.1" } url = "2.2" walkdir = "2" @@ -51,13 +51,15 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } ring = { version = "0.16", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } +# Fix for wasm32-unknown-unknown (see https://docs.rs/getrandom/latest/getrandom/#webassembly-support) +getrandom = { version = "0.2", features = ["js"], optional = true } # AWS Profile support aws-types = { version = "0.49", optional = true } aws-config = { version = "0.49", optional = true } [features] -cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "getrandom"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] diff --git a/README.md b/README.md index fd10414..5b47a65 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,14 @@ change. 
Supported object stores include: * Memory * Custom implementations - Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to [Apache Arrow](https://arrow.apache.org/). See [docs.rs](https://docs.rs/object_store) for usage instructions + +## Support for `wasm32-unknown-unknown` target + +It's possible to build `object_store` for the `wasm32-unknown-unknown` target, however the cloud storage features `aws`, `azure`, and `gcp` are not supported. + +``` +cargo build -p object_store --target wasm32-unknown-unknown +``` \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 5eaaaba..6278d82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -153,6 +153,12 @@ //! ``` //! +#[cfg(all( + target_arch = "wasm32", + any(feature = "gcp", feature = "aws", feature = "azure",) +))] +compile_error!("Features 'gcp', 'aws', 'azure' are not supported on wasm."); + #[cfg(feature = "aws")] pub mod aws; #[cfg(feature = "azure")] @@ -160,6 +166,7 @@ pub mod azure; #[cfg(feature = "gcp")] pub mod gcp; pub mod limit; +#[cfg(not(target_arch = "wasm32"))] pub mod local; pub mod memory; pub mod path; @@ -176,15 +183,16 @@ mod multipart; mod util; use crate::path::Path; -use crate::util::{ - coalesce_ranges, collect_bytes, maybe_spawn_blocking, OBJECT_STORE_COALESCE_DEFAULT, -}; +#[cfg(not(target_arch = "wasm32"))] +use crate::util::maybe_spawn_blocking; +use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use snafu::Snafu; use std::fmt::{Debug, Formatter}; +#[cfg(not(target_arch = "wasm32"))] use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use tokio::io::AsyncWrite; @@ -351,6 +359,7 @@ impl GetResult { /// Collects the data into a [`Bytes`] pub async fn bytes(self) -> Result { match self { + #[cfg(not(target_arch = "wasm32"))] Self::File(mut file, path) => { maybe_spawn_blocking(move || { let len = file.seek(SeekFrom::End(0)).map_err(|source| { @@ -377,6 +386,8 @@ impl GetResult { .await } Self::Stream(s) => collect_bytes(s, None).await, + #[cfg(target_arch = "wasm32")] + _ => unimplemented!("File IO not implemented on wasm32."), } } @@ -396,6 +407,7 @@ impl GetResult { /// no additional complexity or overheads pub fn into_stream(self) -> BoxStream<'static, Result> { match self { + #[cfg(not(target_arch = "wasm32"))] Self::File(file, path) => { const CHUNK_SIZE: usize = 8 * 1024; @@ -424,6 +436,8 @@ impl GetResult { .boxed() } Self::Stream(s) => s, + #[cfg(target_arch = "wasm32")] + _ => unimplemented!("File IO not implemented on wasm32."), } } } diff --git a/src/path/mod.rs b/src/path/mod.rs index 80e0f79..59ad471 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -18,9 +18,11 @@ //! Path abstraction for Object Storage use itertools::Itertools; +#[cfg(not(target_arch = "wasm32"))] use percent_encoding::percent_decode; use snafu::{ensure, ResultExt, Snafu}; use std::fmt::Formatter; +#[cfg(not(target_arch = "wasm32"))] use url::Url; /// The delimiter to separate object namespaces, creating a directory structure. 
@@ -160,6 +162,7 @@ impl Path { }) } + #[cfg(not(target_arch = "wasm32"))] /// Convert a filesystem path to a [`Path`] relative to the filesystem root /// /// This will return an error if the path contains illegal character sequences @@ -176,6 +179,7 @@ impl Path { Self::from_absolute_path(absolute) } + #[cfg(not(target_arch = "wasm32"))] /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root /// /// This will return an error if the path contains illegal character sequences @@ -184,6 +188,7 @@ impl Path { Self::from_absolute_path_with_base(path, None) } + #[cfg(not(target_arch = "wasm32"))] /// Convert a filesystem path to a [`Path`] relative to the provided base /// /// This will return an error if the path contains illegal character sequences @@ -308,6 +313,7 @@ where } } +#[cfg(not(target_arch = "wasm32"))] /// Given an absolute filesystem path convert it to a URL representation without canonicalization pub(crate) fn absolute_path_to_url( path: impl AsRef, diff --git a/src/util.rs b/src/util.rs index 2814ca2..41c72d0 100644 --- a/src/util.rs +++ b/src/util.rs @@ -69,6 +69,7 @@ where } } +#[cfg(not(target_arch = "wasm32"))] /// Takes a function and spawns it to a tokio blocking pool if available pub async fn maybe_spawn_blocking(f: F) -> Result where From 8292ef4ef252a9dd59c36946225f3be2949fe3b2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 29 Oct 2022 17:20:30 +1300 Subject: [PATCH 052/397] Update AWS SDK (#2974) --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f5eb111..fc80cb5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,8 +55,8 @@ rustls-pemfile = { version = "1.0", default-features = false, optional = true } getrandom = { version = "0.2", features = ["js"], optional = true } # AWS Profile support -aws-types = { version = "0.49", optional = true } -aws-config = { version = "0.49", optional = true } +aws-types = { version = "0.51", optional = true } +aws-config = { version = "0.51", optional = true } [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "getrandom"] From 6e6305ec11e29bc273adea79767e5ed9986a7f3f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:46:27 +1300 Subject: [PATCH 053/397] Fix more clippy lints (#3015) --- src/local.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/local.rs b/src/local.rs index fd3c359..f7b7ad7 100644 --- a/src/local.rs +++ b/src/local.rs @@ -803,16 +803,16 @@ fn open_file(path: &PathBuf) -> Result { } fn open_writable_file(path: &PathBuf) -> Result { - match File::create(&path) { + match File::create(path) { Ok(f) => Ok(f), Err(err) if err.kind() == std::io::ErrorKind::NotFound => { let parent = path .parent() .context(UnableToCreateFileSnafu { path: &path, err })?; - std::fs::create_dir_all(&parent) + std::fs::create_dir_all(parent) .context(UnableToCreateDirSnafu { path: parent })?; - match File::create(&path) { + match File::create(path) { Ok(f) => Ok(f), Err(err) => Err(Error::UnableToCreateFile { path: path.to_path_buf(), From e74a3cbfd782ca024b0a3e4847dee4f8f152243c Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Mon, 21 Nov 2022 22:15:38 +0800 Subject: [PATCH 054/397] use chrono add/sub months (#3132) * use cargo add/sub months * update all chrono versions * clippy --- Cargo.toml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index fc80cb5..fd7442f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ all-features = true [dependencies] # In alphabetical order async-trait = "0.1.53" bytes = "1.0" -chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } futures = "0.3" itertools = "0.10.1" parking_lot = { version = "0.12" } From 4ea539ddda44d6935f871abc10fffcb3804d37e2 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 25 Nov 2022 13:30:03 -0500 Subject: [PATCH 055/397] Update version to 28.0.0 and add changelog (#3181) * Update version * Create changelog --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7c2832c..e780ec5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -49,7 +49,7 @@ export TEST_INTEGRATION=1 export AWS_DEFAULT_REGION=us-east-1 export AWS_ACCESS_KEY_ID=test export AWS_SECRET_ACCESS_KEY=test -export AWS_ENDPOINT=http://127.0.0.1:4566 +export AWS_ENDPOINT=http://128.0.0.1:4566 export OBJECT_STORE_BUCKET=test-bucket ``` @@ -79,7 +79,7 @@ $ podman run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azur Create a bucket ``` -$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' +$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://128.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://128.0.0.1:10001/devstoreaccount1;' ``` Run tests From 7f67b7aea50e3f63bb90836e0b3375e5495e8bcc Mon Sep 17 00:00:00 2001 From: Sumit Date: Tue, 29 Nov 2022 11:45:02 +0100 Subject: [PATCH 056/397] object_store: add support for using proxy_url for connection testing (#3109) --- src/aws/client.rs | 23 +++++++++++++----- src/aws/mod.rs | 58 +++++++++++++++++++++++++++++++++++++++++---- src/azure/client.rs | 25 +++++++++++++------ src/azure/mod.rs | 11 ++++++++- src/gcp/mod.rs | 55 +++++++++++++++++++++++++++++++++++++++++- 5 files changed, 153 insertions(+), 19 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index a07cdb3..e51fe41 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -88,6 +88,9 @@ pub(crate) enum Error { #[snafu(display("Got invalid multipart response: {}", source))] InvalidMultipartResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Unable to use proxy url: {}", source))] + ProxyUrl { source: reqwest::Error }, } impl From for crate::Error { @@ -201,6 +204,7 @@ pub struct S3Config { pub credentials: Box, pub retry_config: RetryConfig, pub allow_http: bool, + pub proxy_url: Option, } impl S3Config { @@ -216,13 +220,20 @@ pub(crate) struct S3Client { } impl S3Client { - pub fn new(config: S3Config) -> Self { - let client = reqwest::ClientBuilder::new() - .https_only(!config.allow_http) - .build() - .unwrap(); + pub fn new(config: S3Config) -> Result { + let builder = 
reqwest::ClientBuilder::new().https_only(!config.allow_http); + let client = match &config.proxy_url { + Some(ref url) => { + let pr = reqwest::Proxy::all(url) + .map_err(|source| Error::ProxyUrl { source })?; + builder.proxy(pr) + } + _ => builder, + } + .build() + .unwrap(); - Self { config, client } + Ok(Self { config, client }) } /// Returns the config diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 4a81065..cf7a554 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -36,7 +36,7 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; -use reqwest::Client; +use reqwest::{Client, Proxy}; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; @@ -120,6 +120,9 @@ enum Error { #[snafu(display("Error reading token file: {}", source))] ReadTokenFile { source: std::io::Error }, + + #[snafu(display("Unable to use proxy url: {}", source))] + ProxyUrl { source: reqwest::Error }, } impl From for super::Error { @@ -363,6 +366,7 @@ pub struct AmazonS3Builder { virtual_hosted_style_request: bool, metadata_endpoint: Option, profile: Option, + proxy_url: Option, } impl AmazonS3Builder { @@ -537,6 +541,12 @@ impl AmazonS3Builder { self } + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.proxy_url = Some(proxy_url.into()); + self + } + /// Set the AWS profile name, see /// /// This makes use of [aws-config] to provide credentials and therefore requires @@ -561,6 +571,14 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; + let clientbuilder = match self.proxy_url { + Some(ref url) => { + let pr: Proxy = + Proxy::all(url).map_err(|source| Error::ProxyUrl { source })?; + Client::builder().proxy(pr) + } + None => Client::builder(), + }; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { info!("Using Static credential provider"); @@ -590,7 +608,7 @@ impl AmazonS3Builder { let endpoint = format!("https://sts.{}.amazonaws.com", region); // Disallow non-HTTPs requests - let client = Client::builder().https_only(true).build().unwrap(); + let client = clientbuilder.https_only(true).build().unwrap(); Box::new(WebIdentityProvider { cache: Default::default(), @@ -611,7 +629,7 @@ impl AmazonS3Builder { info!("Using Instance credential provider"); // The instance metadata endpoint is access over HTTP - let client = Client::builder().https_only(false).build().unwrap(); + let client = clientbuilder.https_only(false).build().unwrap(); Box::new(InstanceCredentialProvider { cache: Default::default(), @@ -653,9 +671,10 @@ impl AmazonS3Builder { credentials, retry_config: self.retry_config, allow_http: self.allow_http, + proxy_url: self.proxy_url, }; - let client = Arc::new(S3Client::new(config)); + let client = Arc::new(S3Client::new(config).unwrap()); Ok(AmazonS3 { client }) } @@ -898,4 +917,35 @@ mod tests { let err = integration.delete(&location).await.unwrap_err(); assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); } + + #[tokio::test] + async fn s3_test_proxy_url() { + let s3 = AmazonS3Builder::new() + .with_access_key_id("access_key_id") + .with_secret_access_key("secret_access_key") + .with_region("region") + .with_bucket_name("bucket_name") + .with_allow_http(true) + .with_proxy_url("https://example.com") + .build(); + + assert!(s3.is_ok()); + + let s3 = AmazonS3Builder::new() + .with_access_key_id("access_key_id") + .with_secret_access_key("secret_access_key") + .with_region("region") + .with_bucket_name("bucket_name") + .with_allow_http(true) + .with_proxy_url("asdf://example.com") + .build(); + + assert!(match s3 { + Err(crate::Error::Generic { source, .. }) => matches!( + source.downcast_ref(), + Some(crate::aws::Error::ProxyUrl { .. }) + ), + _ => false, + }) + } } diff --git a/src/azure/client.rs b/src/azure/client.rs index ece0785..d8cfdd1 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -27,7 +27,7 @@ use chrono::{DateTime, TimeZone, Utc}; use itertools::Itertools; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, - Client as ReqwestClient, Method, Response, StatusCode, + Client as ReqwestClient, Method, Proxy, Response, StatusCode, }; use serde::{Deserialize, Deserializer, Serialize}; use snafu::{ResultExt, Snafu}; @@ -82,6 +82,9 @@ pub(crate) enum Error { Authorization { source: crate::azure::credential::Error, }, + + #[snafu(display("Unable to use proxy url: {}", source))] + ProxyUrl { source: reqwest::Error }, } impl From for crate::Error { @@ -124,6 +127,7 @@ pub struct AzureConfig { pub allow_http: bool, pub service: Url, pub is_emulator: bool, + pub proxy_url: Option, } impl AzureConfig { @@ -148,13 +152,20 @@ pub(crate) struct AzureClient { impl AzureClient { /// create a new instance of [AzureClient] - pub fn new(config: AzureConfig) -> Self { - let client = reqwest::ClientBuilder::new() - .https_only(!config.allow_http) - .build() - .unwrap(); + pub fn new(config: AzureConfig) -> Result { + let builder = ReqwestClient::builder(); + + let client = if let Some(url) = config.proxy_url.as_ref() { + let pr = Proxy::all(url).map_err(|source| Error::ProxyUrl { source }); + builder.proxy(pr.unwrap()) + } else { + builder + } + .https_only(!config.allow_http) + .build() + .unwrap(); - Self { config, client } + Ok(Self { config, client }) } /// Returns the config diff --git a/src/azure/mod.rs b/src/azure/mod.rs index f7ca4cf..060b4b2 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -360,6 +360,7 @@ pub struct MicrosoftAzureBuilder { use_emulator: bool, retry_config: RetryConfig, allow_http: bool, + proxy_url: Option, } impl Debug for MicrosoftAzureBuilder { @@ -500,6 +501,12 @@ impl MicrosoftAzureBuilder { self } + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.proxy_url = Some(proxy_url.into()); + self + } + /// Configure a connection to container with given name on Microsoft Azure /// Blob store. 
pub fn build(self) -> Result { @@ -516,6 +523,7 @@ impl MicrosoftAzureBuilder { retry_config, allow_http, authority_host, + proxy_url, } = self; let container = container_name.ok_or(Error::MissingContainerName {})?; @@ -567,9 +575,10 @@ impl MicrosoftAzureBuilder { container, credentials: auth, is_emulator, + proxy_url, }; - let client = Arc::new(client::AzureClient::new(config)); + let client = Arc::new(client::AzureClient::new(config)?); Ok(MicrosoftAzure { client }) } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 0ef4d35..0da92fd 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -41,6 +41,7 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; +use reqwest::Proxy; use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, Snafu}; use tokio::io::AsyncWrite; @@ -122,6 +123,9 @@ enum Error { #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, + + #[snafu(display("Unable to use proxy url: {}", source))] + ProxyUrl { source: reqwest::Error }, } impl From for super::Error { @@ -741,6 +745,7 @@ pub struct GoogleCloudStorageBuilder { service_account_path: Option, client: Option, retry_config: RetryConfig, + proxy_url: Option, } impl GoogleCloudStorageBuilder { @@ -782,6 +787,12 @@ impl GoogleCloudStorageBuilder { self } + /// Set proxy url used for connection + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.proxy_url = Some(proxy_url.into()); + self + } + /// Configure a connection to Google Cloud Storage, returning a /// new [`GoogleCloudStorage`] and consuming `self` pub fn build(self) -> Result { @@ -790,12 +801,24 @@ impl GoogleCloudStorageBuilder { service_account_path, client, retry_config, + proxy_url, } = self; let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; let service_account_path = service_account_path.ok_or(Error::MissingServiceAccountPath)?; - let client = client.unwrap_or_else(Client::new); + + let client = match (proxy_url, client) { + (_, Some(client)) => client, + (Some(url), None) => { + let pr = Proxy::all(&url).map_err(|source| Error::ProxyUrl { source })?; + Client::builder() + .proxy(pr) + .build() + .map_err(|source| Error::ProxyUrl { source })? + } + (None, None) => Client::new(), + }; let credentials = reader_credentials_file(service_account_path)?; @@ -1015,4 +1038,34 @@ mod test { err ) } + + #[tokio::test] + async fn gcs_test_proxy_url() { + use std::io::Write; + use tempfile::NamedTempFile; + let mut tfile = NamedTempFile::new().unwrap(); + let creds = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; + write!(tfile, "{}", creds).unwrap(); + let service_account_path = tfile.path(); + let gcs = GoogleCloudStorageBuilder::new() + .with_service_account_path(service_account_path.to_str().unwrap()) + .with_bucket_name("foo") + .with_proxy_url("https://example.com") + .build(); + assert!(dbg!(gcs).is_ok()); + + let gcs = GoogleCloudStorageBuilder::new() + .with_service_account_path(service_account_path.to_str().unwrap()) + .with_bucket_name("foo") + .with_proxy_url("asdf://example.com") + .build(); + + assert!(match gcs { + Err(ObjectStoreError::Generic { source, .. }) => matches!( + source.downcast_ref(), + Some(crate::gcp::Error::ProxyUrl { .. 
}) + ), + _ => false, + }) + } } From 227d410276a00db37e217cb37e4fef2209bfd3e8 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 1 Dec 2022 06:23:38 -0800 Subject: [PATCH 057/397] fix(object_store,aws,gcp): multipart upload enforce size limit of 5 MiB not 5MB (#3234) * fix: use better minimum part size * test: don't make the test larger than necessary * Further tweaks * Format Co-authored-by: Raphael Taylor-Davies --- CONTRIBUTING.md | 6 +++--- src/lib.rs | 3 ++- src/multipart.rs | 15 ++++++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e780ec5..4e6b3af 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,9 +46,9 @@ Setup environment ``` export TEST_INTEGRATION=1 -export AWS_DEFAULT_REGION=us-east-1 -export AWS_ACCESS_KEY_ID=test -export AWS_SECRET_ACCESS_KEY=test +export OBJECT_STORE_AWS_DEFAULT_REGION=us-east-1 +export OBJECT_STORE_AWS_ACCESS_KEY_ID=test +export OBJECT_STORE_AWS_SECRET_ACCESS_KEY=test export AWS_ENDPOINT=http://128.0.0.1:4566 export OBJECT_STORE_BUCKET=test-bucket ``` diff --git a/src/lib.rs b/src/lib.rs index 6278d82..a36bb5f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -769,7 +769,8 @@ mod tests { assert_eq!(bytes_expected, bytes_written); // Can overwrite some storage - let data = get_vec_of_bytes(5_000, 5); + // Sizes carefully chosen to exactly hit min limit of 5 MiB + let data = get_vec_of_bytes(242_880, 22); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { diff --git a/src/multipart.rs b/src/multipart.rs index 102d8be..de85914 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -81,7 +81,11 @@ where current_buffer: Vec::new(), // TODO: Should self vary by provider? // TODO: Should we automatically increase then when part index gets large? - min_part_size: 5_000_000, + + // Minimum size of 5 MiB + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html + // https://cloud.google.com/storage/quotas#requests + min_part_size: 5_242_880, current_part_idx: 0, completion_task: None, } @@ -113,13 +117,14 @@ where mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> std::task::Poll> { + ) -> Poll> { // Poll current tasks self.as_mut().poll_tasks(cx)?; // If adding buf to pending buffer would trigger send, check // whether we have capacity for another task. - let enough_to_send = (buf.len() + self.current_buffer.len()) > self.min_part_size; + let enough_to_send = + (buf.len() + self.current_buffer.len()) >= self.min_part_size; if enough_to_send && self.tasks.len() < self.max_concurrency { // If we do, copy into the buffer and submit the task, and return ready. 
self.current_buffer.extend_from_slice(buf); @@ -149,7 +154,7 @@ where fn poll_flush( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { // Poll current tasks self.as_mut().poll_tasks(cx)?; @@ -177,7 +182,7 @@ where fn poll_shutdown( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { // First, poll flush match self.as_mut().poll_flush(cx) { Poll::Pending => return Poll::Pending, From 15123f7cb976bea3a0cb41df07e54f9ce34361d2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Dec 2022 09:49:02 +0000 Subject: [PATCH 058/397] Add ObjectStore ClientConfig (#3252) * Add ObjectStore ClientConfig * Fix default allow HTTP for GCP * Fix tests * Tweak error message --- src/aws/client.rs | 22 +++------------ src/aws/mod.rs | 65 +++++++++++++++++++++----------------------- src/azure/client.rs | 26 +++++------------- src/azure/mod.rs | 32 +++++++++++++--------- src/client/mod.rs | 50 ++++++++++++++++++++++++++++++++++ src/gcp/mod.rs | 66 ++++++++++++++++++++++----------------------- src/lib.rs | 3 +++ 7 files changed, 147 insertions(+), 117 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index e51fe41..ccc0a9c 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -23,7 +23,8 @@ use crate::multipart::UploadPart; use crate::path::DELIMITER; use crate::util::{format_http_range, format_prefix}; use crate::{ - BoxStream, ListResult, MultipartId, ObjectMeta, Path, Result, RetryConfig, StreamExt, + BoxStream, ClientOptions, ListResult, MultipartId, ObjectMeta, Path, Result, + RetryConfig, StreamExt, }; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; @@ -88,9 +89,6 @@ pub(crate) enum Error { #[snafu(display("Got invalid multipart response: {}", source))] InvalidMultipartResponse { source: quick_xml::de::DeError }, - - #[snafu(display("Unable to use proxy url: {}", source))] - ProxyUrl { source: reqwest::Error }, } impl From for crate::Error { @@ -203,8 +201,7 @@ pub struct S3Config { pub bucket_endpoint: String, pub credentials: Box, pub retry_config: RetryConfig, - pub allow_http: bool, - pub proxy_url: Option, + pub client_options: ClientOptions, } impl S3Config { @@ -221,18 +218,7 @@ pub(crate) struct S3Client { impl S3Client { pub fn new(config: S3Config) -> Result { - let builder = reqwest::ClientBuilder::new().https_only(!config.allow_http); - let client = match &config.proxy_url { - Some(ref url) => { - let pr = reqwest::Proxy::all(url) - .map_err(|source| Error::ProxyUrl { source })?; - builder.proxy(pr) - } - _ => builder, - } - .build() - .unwrap(); - + let client = config.client_options.client()?; Ok(Self { config, client }) } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index cf7a554..c92b8c2 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -36,7 +36,6 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; -use reqwest::{Client, Proxy}; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; @@ -51,8 +50,8 @@ use crate::aws::credential::{ }; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::{ - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, - RetryConfig, StreamExt, + ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, + Result, RetryConfig, StreamExt, }; mod client; @@ -120,9 +119,6 @@ enum Error { #[snafu(display("Error 
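            // (Size note: 5 MiB = 5 * 1024 * 1024 = 5_242_880 bytes — the smallest
            // part size S3 and GCS accept for any part other than the last — so the
            // `>=` comparison submits a part exactly when it reaches that minimum.)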
reading token file: {}", source))] ReadTokenFile { source: std::io::Error }, - - #[snafu(display("Unable to use proxy url: {}", source))] - ProxyUrl { source: reqwest::Error }, } impl From for super::Error { @@ -361,12 +357,11 @@ pub struct AmazonS3Builder { endpoint: Option, token: Option, retry_config: RetryConfig, - allow_http: bool, imdsv1_fallback: bool, virtual_hosted_style_request: bool, metadata_endpoint: Option, profile: Option, - proxy_url: Option, + client_options: ClientOptions, } impl AmazonS3Builder { @@ -431,7 +426,8 @@ impl AmazonS3Builder { } if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { - builder.allow_http = text == "true"; + builder.client_options = + builder.client_options.with_allow_http(text == "true"); } builder @@ -487,7 +483,7 @@ impl AmazonS3Builder { /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.allow_http = allow_http; + self.client_options = self.client_options.with_allow_http(allow_http); self } @@ -543,7 +539,13 @@ impl AmazonS3Builder { /// Set the proxy_url to be used by the underlying client pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.proxy_url = Some(proxy_url.into()); + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; self } @@ -571,14 +573,6 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; - let clientbuilder = match self.proxy_url { - Some(ref url) => { - let pr: Proxy = - Proxy::all(url).map_err(|source| Error::ProxyUrl { source })?; - Client::builder().proxy(pr) - } - None => Client::builder(), - }; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { info!("Using Static credential provider"); @@ -608,7 +602,11 @@ impl AmazonS3Builder { let endpoint = format!("https://sts.{}.amazonaws.com", region); // Disallow non-HTTPs requests - let client = clientbuilder.https_only(true).build().unwrap(); + let client = self + .client_options + .clone() + .with_allow_http(false) + .client()?; Box::new(WebIdentityProvider { cache: Default::default(), @@ -629,11 +627,12 @@ impl AmazonS3Builder { info!("Using Instance credential provider"); // The instance metadata endpoint is access over HTTP - let client = clientbuilder.https_only(false).build().unwrap(); + let client_options = + self.client_options.clone().with_allow_http(true); Box::new(InstanceCredentialProvider { cache: Default::default(), - client, + client: client_options.client()?, retry_config: self.retry_config.clone(), imdsv1_fallback: self.imdsv1_fallback, metadata_endpoint: self @@ -670,11 +669,10 @@ impl AmazonS3Builder { bucket_endpoint, credentials, retry_config: self.retry_config, - allow_http: self.allow_http, - proxy_url: self.proxy_url, + client_options: self.client_options, }; - let client = Arc::new(S3Client::new(config).unwrap()); + let client = Arc::new(S3Client::new(config)?); Ok(AmazonS3 { client }) } @@ -931,21 +929,20 @@ mod tests { assert!(s3.is_ok()); - let s3 = AmazonS3Builder::new() + let err = AmazonS3Builder::new() .with_access_key_id("access_key_id") .with_secret_access_key("secret_access_key") .with_region("region") .with_bucket_name("bucket_name") .with_allow_http(true) 
.with_proxy_url("asdf://example.com") - .build(); + .build() + .unwrap_err() + .to_string(); - assert!(match s3 { - Err(crate::Error::Generic { source, .. }) => matches!( - source.downcast_ref(), - Some(crate::aws::Error::ProxyUrl { .. }) - ), - _ => false, - }) + assert_eq!( + "Generic HTTP client error: builder error: unknown proxy scheme", + err + ); } } diff --git a/src/azure/client.rs b/src/azure/client.rs index d8cfdd1..b537f5e 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -21,13 +21,16 @@ use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::path::DELIMITER; use crate::util::{format_http_range, format_prefix}; -use crate::{BoxStream, ListResult, ObjectMeta, Path, Result, RetryConfig, StreamExt}; +use crate::{ + BoxStream, ClientOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, + StreamExt, +}; use bytes::{Buf, Bytes}; use chrono::{DateTime, TimeZone, Utc}; use itertools::Itertools; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, - Client as ReqwestClient, Method, Proxy, Response, StatusCode, + Client as ReqwestClient, Method, Response, StatusCode, }; use serde::{Deserialize, Deserializer, Serialize}; use snafu::{ResultExt, Snafu}; @@ -82,9 +85,6 @@ pub(crate) enum Error { Authorization { source: crate::azure::credential::Error, }, - - #[snafu(display("Unable to use proxy url: {}", source))] - ProxyUrl { source: reqwest::Error }, } impl From for crate::Error { @@ -124,10 +124,9 @@ pub struct AzureConfig { pub container: String, pub credentials: CredentialProvider, pub retry_config: RetryConfig, - pub allow_http: bool, pub service: Url, pub is_emulator: bool, - pub proxy_url: Option, + pub client_options: ClientOptions, } impl AzureConfig { @@ -153,18 +152,7 @@ pub(crate) struct AzureClient { impl AzureClient { /// create a new instance of [AzureClient] pub fn new(config: AzureConfig) -> Result { - let builder = ReqwestClient::builder(); - - let client = if let Some(url) = config.proxy_url.as_ref() { - let pr = Proxy::all(url).map_err(|source| Error::ProxyUrl { source }); - builder.proxy(pr.unwrap()) - } else { - builder - } - .https_only(!config.allow_http) - .build() - .unwrap(); - + let client = config.client_options.client()?; Ok(Self { config, client }) } diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 060b4b2..4b7131e 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -30,7 +30,8 @@ use self::client::{BlockId, BlockList}; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::Path, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, + ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + RetryConfig, }; use async_trait::async_trait; use bytes::Bytes; @@ -359,8 +360,7 @@ pub struct MicrosoftAzureBuilder { authority_host: Option, use_emulator: bool, retry_config: RetryConfig, - allow_http: bool, - proxy_url: Option, + client_options: ClientOptions, } impl Debug for MicrosoftAzureBuilder { @@ -480,10 +480,10 @@ impl MicrosoftAzureBuilder { } /// Sets what protocol is allowed. 
If `allow_http` is : - /// * false (default): Only HTTPS is allowed + /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.allow_http = allow_http; + self.client_options = self.client_options.with_allow_http(allow_http); self } @@ -503,7 +503,13 @@ impl MicrosoftAzureBuilder { /// Set the proxy_url to be used by the underlying client pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.proxy_url = Some(proxy_url.into()); + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; self } @@ -521,14 +527,13 @@ impl MicrosoftAzureBuilder { sas_query_pairs, use_emulator, retry_config, - allow_http, authority_host, - proxy_url, + mut client_options, } = self; let container = container_name.ok_or(Error::MissingContainerName {})?; - let (is_emulator, allow_http, storage_url, auth, account) = if use_emulator { + let (is_emulator, storage_url, auth, account) = if use_emulator { let account_name = account_name.unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); // Allow overriding defaults. Values taken from @@ -537,7 +542,9 @@ impl MicrosoftAzureBuilder { let account_key = access_key.unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); let credential = credential::CredentialProvider::AccessKey(account_key); - (true, true, url, credential, account_name) + + client_options = client_options.with_allow_http(true); + (true, url, credential, account_name) } else { let account_name = account_name.ok_or(Error::MissingAccount {})?; let account_url = format!("https://{}.blob.core.windows.net", &account_name); @@ -564,18 +571,17 @@ impl MicrosoftAzureBuilder { } else { Err(Error::MissingCredentials {}) }?; - (false, allow_http, url, credential, account_name) + (false, url, credential, account_name) }; let config = client::AzureConfig { account, - allow_http, retry_config, service: storage_url, container, credentials: auth, is_emulator, - proxy_url, + client_options, }; let client = Arc::new(client::AzureClient::new(config)?); diff --git a/src/client/mod.rs b/src/client/mod.rs index c93c68a..2b58a77 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -23,3 +23,53 @@ pub mod mock_server; pub mod pagination; pub mod retry; pub mod token; + +use reqwest::{Client, ClientBuilder, Proxy}; + +fn map_client_error(e: reqwest::Error) -> super::Error { + super::Error::Generic { + store: "HTTP client", + source: Box::new(e), + } +} + +/// HTTP client configuration for remote object stores +#[derive(Debug, Clone, Default)] +pub struct ClientOptions { + proxy_url: Option, + allow_http: bool, +} + +impl ClientOptions { + /// Create a new [`ClientOptions`] with default values + pub fn new() -> Self { + Default::default() + } + + /// Sets what protocol is allowed. 
If `allow_http` is : + /// * false (default): Only HTTPS are allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.allow_http = allow_http; + self + } + + /// Set an HTTP proxy to use for requests + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.proxy_url = Some(proxy_url.into()); + self + } + + pub(crate) fn client(&self) -> super::Result { + let mut builder = ClientBuilder::new(); + if let Some(proxy) = &self.proxy_url { + let proxy = Proxy::all(proxy).map_err(map_client_error)?; + builder = builder.proxy(proxy); + } + + builder + .https_only(!self.allow_http) + .build() + .map_err(map_client_error) + } +} diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 0da92fd..41d6696 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -41,7 +41,6 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; -use reqwest::Proxy; use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, Snafu}; use tokio::io::AsyncWrite; @@ -53,7 +52,8 @@ use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, util::{format_http_range, format_prefix}, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, + ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + RetryConfig, }; use credential::OAuthProvider; @@ -123,9 +123,6 @@ enum Error { #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, - - #[snafu(display("Unable to use proxy url: {}", source))] - ProxyUrl { source: reqwest::Error }, } impl From for super::Error { @@ -739,13 +736,23 @@ fn reader_credentials_file( /// .with_bucket_name(BUCKET_NAME) /// .build(); /// ``` -#[derive(Debug, Default)] +#[derive(Debug)] pub struct GoogleCloudStorageBuilder { bucket_name: Option, service_account_path: Option, - client: Option, retry_config: RetryConfig, - proxy_url: Option, + client_options: ClientOptions, +} + +impl Default for GoogleCloudStorageBuilder { + fn default() -> Self { + Self { + bucket_name: None, + service_account_path: None, + retry_config: Default::default(), + client_options: ClientOptions::new().with_allow_http(true), + } + } } impl GoogleCloudStorageBuilder { @@ -787,9 +794,15 @@ impl GoogleCloudStorageBuilder { self } - /// Set proxy url used for connection + /// Set the proxy_url to be used by the underlying client pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.proxy_url = Some(proxy_url.into()); + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; self } @@ -799,27 +812,15 @@ impl GoogleCloudStorageBuilder { let Self { bucket_name, service_account_path, - client, retry_config, - proxy_url, + client_options, } = self; let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; let service_account_path = service_account_path.ok_or(Error::MissingServiceAccountPath)?; - let client = match (proxy_url, client) { - (_, Some(client)) => client, - (Some(url), None) => { - let pr = Proxy::all(&url).map_err(|source| Error::ProxyUrl { source })?; - Client::builder() - .proxy(pr) - .build() - .map_err(|source| Error::ProxyUrl { source })? 
- } - (None, None) => Client::new(), - }; - + let client = client_options.client()?; let credentials = reader_credentials_file(service_account_path)?; // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes @@ -1054,18 +1055,17 @@ mod test { .build(); assert!(dbg!(gcs).is_ok()); - let gcs = GoogleCloudStorageBuilder::new() + let err = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) .with_bucket_name("foo") .with_proxy_url("asdf://example.com") - .build(); + .build() + .unwrap_err() + .to_string(); - assert!(match gcs { - Err(ObjectStoreError::Generic { source, .. }) => matches!( - source.downcast_ref(), - Some(crate::gcp::Error::ProxyUrl { .. }) - ), - _ => false, - }) + assert_eq!( + "Generic HTTP client error: builder error: unknown proxy scheme", + err + ); } } diff --git a/src/lib.rs b/src/lib.rs index a36bb5f..ec41f38 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -197,6 +197,9 @@ use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use tokio::io::AsyncWrite; +#[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] +pub use client::ClientOptions; + /// An alias for a dynamically dispatched object store implementation. pub type DynObjectStore = dyn ObjectStore; From ba19b01b5a754104d4657b6af16b29ed20c89a34 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 2 Dec 2022 02:28:28 -0800 Subject: [PATCH 059/397] fix(object_store,gcp): test copy_if_not_exist (#3236) * fix(object_store,gcp): test copy_if_not_exist * doc: update GCS testing instructions * test: move copy test into non-local branch * Revert CONTENT_LENGTH change Co-authored-by: Raphael Taylor-Davies --- CONTRIBUTING.md | 6 +++--- src/gcp/mod.rs | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4e6b3af..efcd5fe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -95,13 +95,13 @@ To test the GCS integration, we use [Fake GCS Server](https://github.com/fsouza/ Startup the fake server: ```shell -docker run -p 4443:4443 fsouza/fake-gcs-server +docker run -p 4443:4443 fsouza/fake-gcs-server -scheme http ``` Configure the account: ```shell -curl --insecure -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "https://localhost:4443/storage/v1/b" -echo '{"gcs_base_url": "https://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > /tmp/gcs.json +curl -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" +echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > /tmp/gcs.json ``` Now run the tests: diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 41d6696..f93cbde 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -123,6 +123,12 @@ enum Error { #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, + + #[snafu(display("Already exists: {}", path))] + AlreadyExists { + source: crate::client::retry::Error, + path: String, + }, } impl From for super::Error { @@ -138,6 +144,10 @@ impl From for super::Error { source: Box::new(source), } } + Error::AlreadyExists { source, path } => Self::AlreadyExists { + source: Box::new(source), + path, + }, _ => Self::Generic { store: "GCS", source: Box::new(err), @@ -419,8 +429,22 @@ impl GoogleCloudStorageClient { .bearer_auth(token) .send_retry(&self.retry_config) .await - .context(CopyRequestSnafu { - 
path: from.as_ref(), + .map_err(|err| { + if err + .status() + .map(|status| status == reqwest::StatusCode::PRECONDITION_FAILED) + .unwrap_or_else(|| false) + { + Error::AlreadyExists { + source: err, + path: to.to_string(), + } + } else { + Error::CopyRequest { + source: err, + path: from.to_string(), + } + } })?; Ok(()) @@ -880,8 +904,8 @@ mod test { use crate::{ tests::{ - get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, + copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, }, Error as ObjectStoreError, ObjectStore, }; @@ -946,6 +970,9 @@ mod test { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; if integration.client.base_url == default_gcs_base_url() { + // Fake GCS server doesn't currently honor ifGenerationMatch + // https://github.com/fsouza/fake-gcs-server/issues/994 + copy_if_not_exists(&integration).await; // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; From b9a8623ccb4c8f0b1274f8f7f7640ca7fc4b20d1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Dec 2022 18:42:41 +0000 Subject: [PATCH 060/397] Add more ClientConfig Options for Object Store RequestBuilder (#3127) (#3256) * Add more ClientConfig Options (#3127) * Add header support --- src/client/mod.rs | 151 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/src/client/mod.rs b/src/client/mod.rs index 2b58a77..47e6863 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -24,7 +24,9 @@ pub mod pagination; pub mod retry; pub mod token; +use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy}; +use std::time::Duration; fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { @@ -33,11 +35,25 @@ fn map_client_error(e: reqwest::Error) -> super::Error { } } +static DEFAULT_USER_AGENT: &str = + concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); + /// HTTP client configuration for remote object stores #[derive(Debug, Clone, Default)] pub struct ClientOptions { + user_agent: Option, + default_headers: Option, proxy_url: Option, allow_http: bool, + timeout: Option, + connect_timeout: Option, + pool_idle_timeout: Option, + pool_max_idle_per_host: Option, + http2_keep_alive_interval: Option, + http2_keep_alive_timeout: Option, + http2_keep_alive_while_idle: bool, + http1_only: bool, + http2_only: bool, } impl ClientOptions { @@ -46,6 +62,20 @@ impl ClientOptions { Default::default() } + /// Sets the User-Agent header to be used by this client + /// + /// Default is based on the version of this crate + pub fn with_user_agent(mut self, agent: HeaderValue) -> Self { + self.user_agent = Some(agent); + self + } + + /// Sets the default headers for every request + pub fn with_default_headers(mut self, headers: HeaderMap) -> Self { + self.default_headers = Some(headers); + self + } + /// Sets what protocol is allowed. 
If `allow_http` is : /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed @@ -54,19 +84,140 @@ impl ClientOptions { self } + /// Only use http1 connections + pub fn with_http1_only(mut self) -> Self { + self.http1_only = true; + self + } + + /// Only use http2 connections + pub fn with_http2_only(mut self) -> Self { + self.http2_only = true; + self + } + /// Set an HTTP proxy to use for requests pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { self.proxy_url = Some(proxy_url.into()); self } + /// Set a request timeout + /// + /// The timeout is applied from when the request starts connecting until the + /// response body has finished + pub fn with_timeout(mut self, timeout: Duration) -> Self { + self.timeout = Some(timeout); + self + } + + /// Set a timeout for only the connect phase of a Client + pub fn with_connect_timeout(mut self, timeout: Duration) -> Self { + self.connect_timeout = Some(timeout); + self + } + + /// Set the pool max idle timeout + /// + /// This is the length of time an idle connection will be kept alive + /// + /// Default is 90 seconds + pub fn with_pool_idle_timeout(mut self, timeout: Duration) -> Self { + self.pool_idle_timeout = Some(timeout); + self + } + + /// Set the maximum number of idle connections per host + /// + /// Default is no limit + pub fn with_pool_max_idle_per_host(mut self, max: usize) -> Self { + self.pool_max_idle_per_host = Some(max); + self + } + + /// Sets an interval for HTTP2 Ping frames should be sent to keep a connection alive. + /// + /// Default is disabled + pub fn with_http2_keep_alive_interval(mut self, interval: Duration) -> Self { + self.http2_keep_alive_interval = Some(interval); + self + } + + /// Sets a timeout for receiving an acknowledgement of the keep-alive ping. + /// + /// If the ping is not acknowledged within the timeout, the connection will be closed. + /// Does nothing if http2_keep_alive_interval is disabled. + /// + /// Default is disabled + pub fn with_http2_keep_alive_timeout(mut self, interval: Duration) -> Self { + self.http2_keep_alive_timeout = Some(interval); + self + } + + /// Enable HTTP2 keep alive pings for idle connections + /// + /// If disabled, keep-alive pings are only sent while there are open request/response + /// streams. 
If enabled, pings are also sent when no streams are active + /// + /// Default is disabled + pub fn with_http2_keep_alive_while_idle(mut self) -> Self { + self.http2_keep_alive_while_idle = true; + self + } + pub(crate) fn client(&self) -> super::Result { let mut builder = ClientBuilder::new(); + + match &self.user_agent { + Some(user_agent) => builder = builder.user_agent(user_agent), + None => builder = builder.user_agent(DEFAULT_USER_AGENT), + } + + if let Some(headers) = &self.default_headers { + builder = builder.default_headers(headers.clone()) + } + if let Some(proxy) = &self.proxy_url { let proxy = Proxy::all(proxy).map_err(map_client_error)?; builder = builder.proxy(proxy); } + if let Some(timeout) = self.timeout { + builder = builder.timeout(timeout) + } + + if let Some(timeout) = self.connect_timeout { + builder = builder.connect_timeout(timeout) + } + + if let Some(timeout) = self.pool_idle_timeout { + builder = builder.pool_idle_timeout(timeout) + } + + if let Some(max) = self.pool_max_idle_per_host { + builder = builder.pool_max_idle_per_host(max) + } + + if let Some(interval) = self.http2_keep_alive_interval { + builder = builder.http2_keep_alive_interval(interval) + } + + if let Some(interval) = self.http2_keep_alive_timeout { + builder = builder.http2_keep_alive_timeout(interval) + } + + if self.http2_keep_alive_while_idle { + builder = builder.http2_keep_alive_while_idle(true) + } + + if self.http1_only { + builder = builder.http1_only() + } + + if self.http2_only { + builder = builder.http2_prior_knowledge() + } + builder .https_only(!self.allow_http) .build() From 99c5f5bc36f8e347f5db23f4db89a9912fb55f8d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 Dec 2022 15:19:19 -0500 Subject: [PATCH 061/397] Update object_store version to 0.5.2 and add CHANGELOG (#3253) * Update object_store crate version to 0.5.2 * Initial changelog * Updates * More update * Update Changlog, semi manually * final updates --- CHANGELOG-old.md | 29 ++++++++++++++++++++++++++++- CHANGELOG.md | 30 ++++++++++++++++-------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 8 ++++---- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index bf1ef62..1397d8a 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,34 @@ # Historical Changelog +# Changelog + +## [object_store_0.5.1](https://github.com/apache/arrow-rs/tree/object_store_0.5.1) (2022-10-04) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) + +**Implemented enhancements:** + +- Allow HTTP S3 URLs [\#2806](https://github.com/apache/arrow-rs/issues/2806) +- object\_store: support AWS ECS instance credentials [\#2802](https://github.com/apache/arrow-rs/issues/2802) +- Object Store S3 Alibaba Cloud OSS support [\#2777](https://github.com/apache/arrow-rs/issues/2777) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Expose option to use GCS object store in integration tests [\#2627](https://github.com/apache/arrow-rs/issues/2627) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- S3 Signature Error Performing List With Prefix Containing Spaces [\#2800](https://github.com/apache/arrow-rs/issues/2800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Erratic Behaviour if Incorrect S3 Region Configured [\#2795](https://github.com/apache/arrow-rs/issues/2795) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Support for overriding instance metadata endpoint [\#2811](https://github.com/apache/arrow-rs/pull/2811) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Allow Configuring non-TLS HTTP Connections in AmazonS3Builder::from\_env [\#2807](https://github.com/apache/arrow-rs/pull/2807) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Fix S3 query canonicalization \(\#2800\) [\#2801](https://github.com/apache/arrow-rs/pull/2801) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Handle incomplete HTTP redirects missing LOCATION \(\#2795\) [\#2796](https://github.com/apache/arrow-rs/pull/2796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Handle S3 virtual host request type [\#2782](https://github.com/apache/arrow-rs/pull/2782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) +- Fix object\_store multipart uploads on S3 Compatible Stores [\#2731](https://github.com/apache/arrow-rs/pull/2731) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mildbyte](https://github.com/mildbyte)) + + ## [object_store_0.5.0](https://github.com/apache/arrow-rs/tree/object_store_0.5.0) (2022-09-08) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.4.0...object_store_0.5.0) @@ -105,4 +133,3 @@ - Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* - diff --git a/CHANGELOG.md b/CHANGELOG.md index 6919111..528d649 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,30 +19,32 @@ # Changelog -## [object_store_0.5.1](https://github.com/apache/arrow-rs/tree/object_store_0.5.1) (2022-10-04) +## [object_store_0.5.2](https://github.com/apache/arrow-rs/tree/object_store_0.5.2) (2022-12-02) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.1...object_store_0.5.2) **Implemented enhancements:** -- Allow HTTP S3 URLs [\#2806](https://github.com/apache/arrow-rs/issues/2806) -- object\_store: support AWS ECS instance credentials [\#2802](https://github.com/apache/arrow-rs/issues/2802) -- Object Store S3 Alibaba Cloud OSS support [\#2777](https://github.com/apache/arrow-rs/issues/2777) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Expose option to use GCS object store in integration tests [\#2627](https://github.com/apache/arrow-rs/issues/2627) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Object Store: Allow custom reqwest client [\#3127](https://github.com/apache/arrow-rs/issues/3127) +- socks5 proxy support for the object\_store crate [\#2989](https://github.com/apache/arrow-rs/issues/2989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Cannot query S3 paths containing whitespace 
[\#2799](https://github.com/apache/arrow-rs/issues/2799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- S3 Signature Error Performing List With Prefix Containing Spaces [\#2800](https://github.com/apache/arrow-rs/issues/2800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Erratic Behaviour if Incorrect S3 Region Configured [\#2795](https://github.com/apache/arrow-rs/issues/2795) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store\(gcp\): GCP complains about content-length for copy [\#3235](https://github.com/apache/arrow-rs/issues/3235) +- object\_store\(aws\): EntityTooSmall error on multi-part upload [\#3233](https://github.com/apache/arrow-rs/issues/3233) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Support for overriding instance metadata endpoint [\#2811](https://github.com/apache/arrow-rs/pull/2811) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- Allow Configuring non-TLS HTTP Connections in AmazonS3Builder::from\_env [\#2807](https://github.com/apache/arrow-rs/pull/2807) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) -- Fix S3 query canonicalization \(\#2800\) [\#2801](https://github.com/apache/arrow-rs/pull/2801) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Handle incomplete HTTP redirects missing LOCATION \(\#2795\) [\#2796](https://github.com/apache/arrow-rs/pull/2796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Handle S3 virtual host request type [\#2782](https://github.com/apache/arrow-rs/pull/2782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) -- Fix object\_store multipart uploads on S3 Compatible Stores [\#2731](https://github.com/apache/arrow-rs/pull/2731) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mildbyte](https://github.com/mildbyte)) +- Add more ClientConfig Options for Object Store RequestBuilder \(\#3127\) [\#3256](https://github.com/apache/arrow-rs/pull/3256) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore ClientConfig [\#3252](https://github.com/apache/arrow-rs/pull/3252) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix\(object\_store,gcp\): test copy\_if\_not\_exist [\#3236](https://github.com/apache/arrow-rs/pull/3236) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- fix\(object\_store,aws,gcp\): multipart upload enforce size limit of 5 MiB not 5MB [\#3234](https://github.com/apache/arrow-rs/pull/3234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- object\_store: add support for using proxy\_url for connection testing [\#3109](https://github.com/apache/arrow-rs/pull/3109) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sum12](https://github.com/sum12)) +- Update AWS SDK [\#2974](https://github.com/apache/arrow-rs/pull/2974) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.25.0 to 0.26.0 [\#2918](https://github.com/apache/arrow-rs/pull/2918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support building object_store and parquet on wasm32-unknown-unknown target [\#2896](https://github.com/apache/arrow-rs/pull/2899) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jondo2010](https://github.com/jondo2010)) +- Add experimental AWS\_PROFILE support \(\#2178\) [\#2891](https://github.com/apache/arrow-rs/pull/2891) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) diff --git a/Cargo.toml b/Cargo.toml index fd7442f..9b1dee5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.1" +version = "0.5.2" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 865acde..cf070d3 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.0" -FUTURE_RELEASE="object_store_0.5.1" +SINCE_TAG="object_store_0.5.1" +FUTURE_RELEASE="object_store_0.5.2" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" @@ -49,8 +49,8 @@ docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pw --cache-file=.githubchangeloggenerator.cache \ --cache-log=.githubchangeloggenerator.cache.log \ --http-cache \ - --max-issues=300 \ - --exclude-tags-regex "^\d+\.\d+\.\d+$" \ + --max-issues=600 \ + --exclude-tags-regex "(^\d+\.\d+\.\d+$)|(rc)" \ --since-tag ${SINCE_TAG} \ --future-release ${FUTURE_RELEASE} From baa43b6865ba7284d2227d4096d6df29f02e4e00 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 2 Dec 2022 15:38:11 -0800 Subject: [PATCH 062/397] Skip aws integration test (#3262) --- src/aws/credential.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 32430d7..900af24 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -664,6 +664,7 @@ mod tests { async fn test_instance_metadata() { if env::var("TEST_INTEGRATION").is_err() { eprintln!("skipping AWS integration test"); + return; } // For example https://github.com/aws/amazon-ec2-metadata-mock From ca72f533a95d465f56aeb9c056644322e4a99a8c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 5 Dec 2022 22:52:20 +0000 Subject: [PATCH 063/397] Reload token from AWS_WEB_IDENTITY_TOKEN_FILE (#3274) * Reload token from AWS_WEB_IDENTITY_TOKEN_FILE * Clippy * Update object_store/src/aws/credential.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- src/aws/credential.rs | 11 +++++++---- src/aws/mod.rs | 11 +++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 900af24..199899d 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -341,7 +341,7 @@ impl CredentialProvider for InstanceCredentialProvider { #[derive(Debug)] pub struct WebIdentityProvider { pub cache: TokenCache>, - pub token: String, + pub token_path: String, pub role_arn: String, pub session_name: String, pub endpoint: String, @@ -355,7 +355,7 @@ 
impl CredentialProvider for WebIdentityProvider { web_identity( &self.client, &self.retry_config, - &self.token, + &self.token_path, &self.role_arn, &self.session_name, &self.endpoint, @@ -477,11 +477,14 @@ impl From for AwsCredential { async fn web_identity( client: &Client, retry_config: &RetryConfig, - token: &str, + token_path: &str, role_arn: &str, session_name: &str, endpoint: &str, ) -> Result>, StdError> { + let token = std::fs::read_to_string(token_path) + .map_err(|e| format!("Failed to read token file '{}': {}", token_path, e))?; + let bytes = client .request(Method::POST, endpoint) .query(&[ @@ -490,7 +493,7 @@ async fn web_identity( ("RoleArn", role_arn), ("RoleSessionName", session_name), ("Version", "2011-06-15"), - ("WebIdentityToken", token), + ("WebIdentityToken", &token), ]) .send_retry(retry_config) .await? diff --git a/src/aws/mod.rs b/src/aws/mod.rs index c92b8c2..aa419d6 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -116,9 +116,6 @@ enum Error { #[snafu(display("Received header containing non-ASCII data"))] BadHeader { source: reqwest::header::ToStrError }, - - #[snafu(display("Error reading token file: {}", source))] - ReadTokenFile { source: std::io::Error }, } impl From for super::Error { @@ -588,13 +585,11 @@ impl AmazonS3Builder { (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), // TODO: Replace with `AmazonS3Builder::credentials_from_env` _ => match ( - std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE"), + std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), std::env::var("AWS_ROLE_ARN"), ) { - (Some(token_file), Ok(role_arn)) => { + (Ok(token_path), Ok(role_arn)) => { info!("Using WebIdentity credential provider"); - let token = std::fs::read_to_string(token_file) - .context(ReadTokenFileSnafu)?; let session_name = std::env::var("AWS_ROLE_SESSION_NAME") .unwrap_or_else(|_| "WebIdentitySession".to_string()); @@ -610,7 +605,7 @@ impl AmazonS3Builder { Box::new(WebIdentityProvider { cache: Default::default(), - token, + token_path, session_name, role_arn, endpoint, From d00d417c8fc821cba918168130ea0f202ba70c98 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 6 Dec 2022 17:35:31 +0000 Subject: [PATCH 064/397] Disable getrandom object_store (#3278) --- Cargo.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9b1dee5..f378315 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,15 +51,13 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } ring = { version = "0.16", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } -# Fix for wasm32-unknown-unknown (see https://docs.rs/getrandom/latest/getrandom/#webassembly-support) -getrandom = { version = "0.2", features = ["js"], optional = true } # AWS Profile support aws-types = { version = "0.51", optional = true } aws-config = { version = "0.51", optional = true } [features] -cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "getrandom"] +cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] From 3cf04865533a5ba9b5c3b2d4c922555d2722636a Mon Sep 17 00:00:00 
2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Mon, 12 Dec 2022 16:35:03 +0100 Subject: [PATCH 065/397] feat(object_store): add PrefixObjectStore (#3329) * feat(object_store): add PrefixObjectStore * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * chore: PR comments * refactor: infallible full_path Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/lib.rs | 1 + src/prefix.rs | 281 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 src/prefix.rs diff --git a/src/lib.rs b/src/lib.rs index ec41f38..0cd5661 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -170,6 +170,7 @@ pub mod limit; pub mod local; pub mod memory; pub mod path; +pub mod prefix; pub mod throttle; #[cfg(any(feature = "gcp", feature = "aws", feature = "azure"))] diff --git a/src/prefix.rs b/src/prefix.rs new file mode 100644 index 0000000..d61fc22 --- /dev/null +++ b/src/prefix.rs @@ -0,0 +1,281 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store wrapper handling a constant path prefix +use bytes::Bytes; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use std::ops::Range; +use tokio::io::AsyncWrite; + +use crate::path::Path; +use crate::{ + GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, + Result as ObjectStoreResult, +}; + +/// Store wrapper that applies a constant prefix to all paths handled by the store. +#[derive(Debug, Clone)] +pub struct PrefixObjectStore { + prefix: Path, + inner: T, +} + +impl std::fmt::Display for PrefixObjectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "PrefixObjectStore({})", self.prefix.as_ref()) + } +} + +impl PrefixObjectStore { + /// Create a new instance of [`PrefixObjectStore`] + pub fn new(store: T, prefix: impl Into) -> Self { + Self { + prefix: prefix.into(), + inner: store, + } + } + + /// Create the full path from a path relative to prefix + fn full_path(&self, location: &Path) -> Path { + self.prefix.parts().chain(location.parts()).collect() + } + + /// Strip the constant prefix from a given path + fn strip_prefix(&self, path: &Path) -> Option { + Some(path.prefix_match(&self.prefix)?.collect()) + } +} + +#[async_trait::async_trait] +impl ObjectStore for PrefixObjectStore { + /// Save the provided bytes to the specified location. + async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { + let full_path = self.full_path(location); + self.inner.put(&full_path, bytes).await + } + + /// Return the bytes that are stored at the specified location. 
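    // (The remaining methods all follow the same pattern: map the caller's relative
    // path through `full_path` before delegating to the inner store, and map any
    // returned locations back with `strip_prefix`.)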
+ async fn get(&self, location: &Path) -> ObjectStoreResult { + let full_path = self.full_path(location); + self.inner.get(&full_path).await + } + + /// Return the bytes that are stored at the specified location + /// in the given byte range + async fn get_range( + &self, + location: &Path, + range: Range, + ) -> ObjectStoreResult { + let full_path = self.full_path(location); + self.inner.get_range(&full_path, range).await + } + + /// Return the metadata for the specified location + async fn head(&self, location: &Path) -> ObjectStoreResult { + let full_path = self.full_path(location); + self.inner.head(&full_path).await.map(|meta| ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: self.strip_prefix(&meta.location).unwrap_or(meta.location), + }) + } + + /// Delete the object at the specified location. + async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { + let full_path = self.full_path(location); + self.inner.delete(&full_path).await + } + + /// List all the objects with the given prefix. + /// + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// `foo/bar_baz/x`. + async fn list( + &self, + prefix: Option<&Path>, + ) -> ObjectStoreResult>> { + Ok(self + .inner + .list(Some(&self.full_path(prefix.unwrap_or(&Path::from("/"))))) + .await? + .map_ok(|meta| ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: self.strip_prefix(&meta.location).unwrap_or(meta.location), + }) + .boxed()) + } + + /// List objects with the given prefix and an implementation specific + /// delimiter. Returns common prefixes (directories) in addition to object + /// metadata. + /// + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// `foo/bar_baz/x`. + async fn list_with_delimiter( + &self, + prefix: Option<&Path>, + ) -> ObjectStoreResult { + self.inner + .list_with_delimiter(Some( + &self.full_path(prefix.unwrap_or(&Path::from("/"))), + )) + .await + .map(|lst| ListResult { + common_prefixes: lst + .common_prefixes + .iter() + .filter_map(|p| self.strip_prefix(p)) + .collect(), + objects: lst + .objects + .iter() + .filter_map(|meta| { + Some(ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: self.strip_prefix(&meta.location)?, + }) + }) + .collect(), + }) + } + + /// Copy an object from one path to another in the same object store. + /// + /// If there exists an object at the destination, it will be overwritten. + async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + let full_from = self.full_path(from); + let full_to = self.full_path(to); + self.inner.copy(&full_from, &full_to).await + } + + /// Copy an object from one path to another, only if destination is empty. + /// + /// Will return an error if the destination already has an object. + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + let full_from = self.full_path(from); + let full_to = self.full_path(to); + self.inner.copy_if_not_exists(&full_from, &full_to).await + } + + /// Move an object from one path to another in the same object store. + /// + /// Will return an error if the destination already has an object. 
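
As a usage sketch (the store, path, and payload below are made up for illustration), wrapping an in-memory store shows how the prefix stays invisible to callers:

```rust
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::prefix::PrefixObjectStore;
use object_store::ObjectStore;

async fn prefix_example() -> object_store::Result<()> {
    // Everything written through the wrapper lands under "data/" in the inner store.
    let store = PrefixObjectStore::new(InMemory::new(), "data");
    store.put(&Path::from("file.json"), "hello".into()).await?;

    // Callers keep using the relative path; the wrapper translates it internally.
    let bytes = store.get(&Path::from("file.json")).await?.bytes().await?;
    assert_eq!(bytes.as_ref(), b"hello");
    Ok(())
}
```
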
+ async fn rename_if_not_exists( + &self, + from: &Path, + to: &Path, + ) -> ObjectStoreResult<()> { + let full_from = self.full_path(from); + let full_to = self.full_path(to); + self.inner.rename_if_not_exists(&full_from, &full_to).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> ObjectStoreResult<(MultipartId, Box)> { + let full_path = self.full_path(location); + self.inner.put_multipart(&full_path).await + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> ObjectStoreResult<()> { + let full_path = self.full_path(location); + self.inner.abort_multipart(&full_path, multipart_id).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::local::LocalFileSystem; + use crate::test_util::flatten_list_stream; + use crate::tests::{ + copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, stream_get, + }; + + use tempfile::TempDir; + + #[tokio::test] + async fn prefix_test() { + let root = TempDir::new().unwrap(); + let inner = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + let integration = PrefixObjectStore::new(inner, "prefix"); + + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + + #[tokio::test] + async fn prefix_test_applies_prefix() { + let tmpdir = TempDir::new().unwrap(); + let local = LocalFileSystem::new_with_prefix(tmpdir.path()).unwrap(); + + let location = Path::from("prefix/test_file.json"); + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + local.put(&location, data).await.unwrap(); + + let prefix = PrefixObjectStore::new(local, "prefix"); + let location_prefix = Path::from("test_file.json"); + + let content_list = flatten_list_stream(&prefix, None).await.unwrap(); + assert_eq!(content_list, &[location_prefix.clone()]); + + let root = Path::from("/"); + let content_list = flatten_list_stream(&prefix, Some(&root)).await.unwrap(); + assert_eq!(content_list, &[location_prefix.clone()]); + + let read_data = prefix + .get(&location_prefix) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + + let target_prefix = Path::from("/test_written.json"); + prefix + .put(&target_prefix, expected_data.clone()) + .await + .unwrap(); + + prefix.delete(&location_prefix).await.unwrap(); + + let local = LocalFileSystem::new_with_prefix(tmpdir.path()).unwrap(); + + let err = local.get(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + + let location = Path::from("prefix/test_written.json"); + let read_data = local.get(&location).await.unwrap().bytes().await.unwrap(); + assert_eq!(&*read_data, expected_data) + } +} From e7ddb6c104ac2efff7928e956dbd751a87b1757d Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Mon, 12 Dec 2022 18:08:51 +0100 Subject: [PATCH 066/397] feat(object_store): parse well-known storage urls (#3327) * feat(object_store): add url parsing to azure builder * feat(object_store): add url parsing to aws builder * feat(object_store): add url parsing to gcs builder * feat(object_store): parse gcs service account from env * fix: typo * docs(object_store): fix example / template urls * feat(object_store): parse S3 virtually hosted urls * refactor: raise url parsing errors on build * fix: properly set virtual_hosted_style_request in url parsing --- src/aws/mod.rs | 97 ++++++++++++++++++++++++++++++++++++++++- src/azure/mod.rs | 111 ++++++++++++++++++++++++++++++++++++++++++++++- src/gcp/mod.rs | 95 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 300 insertions(+), 3 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index aa419d6..0fcfbaf 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -42,6 +42,7 @@ use std::ops::Range; use std::sync::Arc; use tokio::io::AsyncWrite; use tracing::info; +use url::Url; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ @@ -116,6 +117,18 @@ enum Error { #[snafu(display("Received header containing non-ASCII data"))] BadHeader { source: reqwest::header::ToStrError }, + + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, } impl From for super::Error { @@ -359,6 +372,7 @@ pub struct AmazonS3Builder { metadata_endpoint: Option, profile: Option, client_options: ClientOptions, + url_parse_error: Option, } impl AmazonS3Builder { @@ -430,6 +444,67 @@ impl AmazonS3Builder { builder } + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `s3:///` + /// - `s3a:///` + /// - `https://s3..amazonaws.com` + /// - `https://.s3..amazonaws.com` + /// + /// Please note that this is a best effort implementation, and will not fail for malformed URLs, + /// but rather warn and ignore the passed url. The url also has no effect on how the + /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// + /// # Example + /// ``` + /// use object_store::aws::AmazonS3Builder; + /// + /// let s3 = AmazonS3Builder::from_env() + /// .with_url("s3://bucket/path") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl AsRef) -> Self { + let maybe_parsed = Url::parse(url.as_ref()); + match maybe_parsed { + Ok(parsed) => match parsed.scheme() { + "s3" | "s3a" => { + self.bucket_name = parsed.host_str().map(|host| host.to_owned()); + } + "https" => { + if let Some(host) = parsed.host_str() { + let parts = host.splitn(4, '.').collect::>(); + if parts.len() == 4 && parts[0] == "s3" && parts[2] == "amazonaws" + { + self.bucket_name = Some(parts[1].to_string()); + } + if parts.len() == 4 + && parts[1] == "s3" + && parts[3] == "amazonaws.com" + { + self.bucket_name = Some(parts[0].to_string()); + self.region = Some(parts[2].to_string()); + self.virtual_hosted_style_request = true; + } + } + } + other => { + self.url_parse_error = Some(Error::UnknownUrlScheme { + scheme: other.into(), + }); + } + }, + Err(err) => { + self.url_parse_error = Some(Error::UnableToParseUrl { + source: err, + url: url.as_ref().into(), + }); + } + }; + self + } + /// Set the AWS Access Key (required) pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { self.access_key_id = Some(access_key_id.into()); @@ -567,6 +642,10 @@ impl AmazonS3Builder { /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(self) -> Result { + if let Some(err) = self.url_parse_error { + return Err(err.into()); + } + let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; @@ -642,8 +721,8 @@ impl AmazonS3Builder { let endpoint: String; let bucket_endpoint: String; - //If `endpoint` is provided then its assumed to be consistent with - // `virutal_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then + // If `endpoint` is provided then its assumed to be consistent with + // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then // `endpoint` should have bucket name included. 
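Because `with_url` is deliberately infallible, an unsupported or malformed URL is only reported once `build()` runs and returns the stored `url_parse_error`. A small sketch of that deferred-error behaviour (illustrative only):

```rust
use object_store::aws::AmazonS3Builder;

fn main() {
    // "s3://bucket/path" is recognised and sets the bucket name; an
    // unsupported scheme is only surfaced when `build()` is called.
    let result = AmazonS3Builder::new()
        .with_url("file:///definitely-not-s3")
        .build();
    assert!(result.is_err());
}
```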
if self.virtual_hosted_style_request { endpoint = self.endpoint.unwrap_or_else(|| { @@ -940,4 +1019,18 @@ mod tests { err ); } + + #[test] + fn s3_test_urls() { + let builder = AmazonS3Builder::new().with_url("s3://bucket/path"); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let builder = AmazonS3Builder::new().with_url("https://s3.bucket.amazonaws.com"); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let builder = + AmazonS3Builder::new().with_url("https://bucket.s3.region.amazonaws.com"); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + assert_eq!(builder.region, Some("region".to_string())) + } } diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 4b7131e..2cc4fe1 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -114,6 +114,12 @@ enum Error { #[snafu(display("Azure credential error: {}", source), context(false))] Credential { source: credential::Error }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, } impl From for super::Error { @@ -361,6 +367,7 @@ pub struct MicrosoftAzureBuilder { use_emulator: bool, retry_config: RetryConfig, client_options: ClientOptions, + url_parse_error: Option, } impl Debug for MicrosoftAzureBuilder { @@ -379,7 +386,7 @@ impl MicrosoftAzureBuilder { Default::default() } - /// Create an instance of [MicrosoftAzureBuilder] with values pre-populated from environment variables. + /// Create an instance of [`MicrosoftAzureBuilder`] with values pre-populated from environment variables. /// /// Variables extracted from environment: /// * AZURE_STORAGE_ACCOUNT_NAME: storage account name @@ -424,6 +431,78 @@ impl MicrosoftAzureBuilder { builder } + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `abfs[s]:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `abfs[s]://@.dfs.core.windows.net/` + /// - `az:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `adl:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `azure:///` (custom) + /// - `https://.dfs.core.windows.net` + /// - `https://.blob.core.windows.net` + /// + /// Please note that this is a best effort implementation, and will not fail for malformed URLs, + /// but rather warn and ignore the passed url. The url also has no effect on how the + /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// + /// # Example + /// ``` + /// use object_store::azure::MicrosoftAzureBuilder; + /// + /// let azure = MicrosoftAzureBuilder::from_env() + /// .with_url("abfss://file_system@account.dfs.core.windows.net/") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl AsRef) -> Self { + let maybe_parsed = Url::parse(url.as_ref()); + match maybe_parsed { + Ok(parsed) => match parsed.scheme() { + "az" | "adl" | "azure" => { + self.container_name = parsed.host_str().map(|host| host.to_owned()); + } + "abfs" | "abfss" => { + // abfs(s) might refer to the fsspec convention abfs:/// + // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ + if parsed.username().is_empty() { + self.container_name = + parsed.host_str().map(|host| host.to_owned()); + } else if let Some(host) = parsed.host_str() { + let parts = host.splitn(2, '.').collect::>(); + if parts.len() == 2 && parts[1] == "dfs.core.windows.net" { + self.container_name = Some(parsed.username().to_owned()); + self.account_name = Some(parts[0].to_string()); + } + } + } + "https" => { + if let Some(host) = parsed.host_str() { + let parts = host.splitn(2, '.').collect::>(); + if parts.len() == 2 + && (parts[1] == "dfs.core.windows.net" + || parts[1] == "blob.core.windows.net") + { + self.account_name = Some(parts[0].to_string()); + } + } + } + other => { + self.url_parse_error = Some(Error::UnknownUrlScheme { + scheme: other.into(), + }); + } + }, + Err(err) => { + self.url_parse_error = Some(Error::UnableToParseUrl { + source: err, + url: url.as_ref().into(), + }); + } + }; + self + } + /// Set the Azure Account (required) pub fn with_account(mut self, account: impl Into) -> Self { self.account_name = Some(account.into()); @@ -529,8 +608,13 @@ impl MicrosoftAzureBuilder { retry_config, authority_host, mut client_options, + url_parse_error, } = self; + if let Some(err) = url_parse_error { + return Err(err.into()); + } + let container = container_name.ok_or(Error::MissingContainerName {})?; let (is_emulator, storage_url, auth, account) = if use_emulator { @@ -716,4 +800,29 @@ mod tests { copy_if_not_exists(&integration).await; stream_get(&integration).await; } + + #[test] + fn azure_blob_test_urls() { + let builder = MicrosoftAzureBuilder::new() + .with_url("abfss://file_system@account.dfs.core.windows.net/"); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + + let builder = MicrosoftAzureBuilder::new().with_url("abfs://container/path"); + assert_eq!(builder.container_name, Some("container".to_string())); + + let builder = MicrosoftAzureBuilder::new().with_url("az://container"); + assert_eq!(builder.container_name, Some("container".to_string())); + + let builder = MicrosoftAzureBuilder::new().with_url("az://container/path"); + assert_eq!(builder.container_name, Some("container".to_string())); + + let builder = MicrosoftAzureBuilder::new() + .with_url("https://account.dfs.core.windows.net/"); + assert_eq!(builder.account_name, Some("account".to_string())); + + let builder = MicrosoftAzureBuilder::new() + .with_url("https://account.blob.core.windows.net/"); + assert_eq!(builder.account_name, Some("account".to_string())) + } } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index f93cbde..b3bd572 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -44,6 +44,7 @@ use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, Snafu}; use tokio::io::AsyncWrite; +use url::Url; use 
crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; @@ -129,6 +130,18 @@ enum Error { source: crate::client::retry::Error, path: String, }, + + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, } impl From for super::Error { @@ -766,6 +779,7 @@ pub struct GoogleCloudStorageBuilder { service_account_path: Option, retry_config: RetryConfig, client_options: ClientOptions, + url_parse_error: Option, } impl Default for GoogleCloudStorageBuilder { @@ -775,6 +789,7 @@ impl Default for GoogleCloudStorageBuilder { service_account_path: None, retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), + url_parse_error: None, } } } @@ -785,6 +800,75 @@ impl GoogleCloudStorageBuilder { Default::default() } + /// Create an instance of [`GoogleCloudStorageBuilder`] with values pre-populated from environment variables. + /// + /// Variables extracted from environment: + /// * GOOGLE_SERVICE_ACCOUNT: location of service account file + /// * SERVICE_ACCOUNT: (alias) location of service account file + /// + /// # Example + /// ``` + /// use object_store::gcp::GoogleCloudStorageBuilder; + /// + /// let azure = GoogleCloudStorageBuilder::from_env() + /// .with_bucket_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder = Self::default(); + + if let Ok(service_account_path) = std::env::var("SERVICE_ACCOUNT") { + builder.service_account_path = Some(service_account_path); + } + + if let Ok(service_account_path) = std::env::var("GOOGLE_SERVICE_ACCOUNT") { + builder.service_account_path = Some(service_account_path); + } + + builder + } + + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `gs:///` + /// + /// Please note that this is a best effort implementation, and will not fail for malformed URLs, + /// but rather warn and ignore the passed url. The url also has no effect on how the + /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// + /// # Example + /// ``` + /// use object_store::gcp::GoogleCloudStorageBuilder; + /// + /// let gcs = GoogleCloudStorageBuilder::from_env() + /// .with_url("gs://bucket/path") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl AsRef) -> Self { + let maybe_parsed = Url::parse(url.as_ref()); + match maybe_parsed { + Ok(parsed) => match parsed.scheme() { + "gs" => { + self.bucket_name = parsed.host_str().map(|host| host.to_owned()); + } + other => { + self.url_parse_error = Some(Error::UnknownUrlScheme { + scheme: other.into(), + }); + } + }, + Err(err) => { + self.url_parse_error = Some(Error::UnableToParseUrl { + source: err, + url: url.as_ref().into(), + }); + } + }; + self + } + /// Set the bucket name (required) pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { self.bucket_name = Some(bucket_name.into()); @@ -838,8 +922,13 @@ impl GoogleCloudStorageBuilder { service_account_path, retry_config, client_options, + url_parse_error, } = self; + if let Some(err) = url_parse_error { + return Err(err.into()); + } + let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; let service_account_path = service_account_path.ok_or(Error::MissingServiceAccountPath)?; @@ -1095,4 +1184,10 @@ mod test { err ); } + + #[test] + fn gcs_test_urls() { + let builder = GoogleCloudStorageBuilder::new().with_url("gs://bucket/path"); + assert_eq!(builder.bucket_name, Some("bucket".to_string())) + } } From 863fe08a40cfeaa75a0840d51eaab4a0c8cb0f81 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 12 Dec 2022 19:08:01 +0000 Subject: [PATCH 067/397] Update base64 to 0.20 (#3335) * Update base64 to 0.20 * Fix object_store --- Cargo.toml | 2 +- src/gcp/credential.rs | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f378315..a662a81 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,7 @@ url = "2.2" walkdir = "2" # Cloud storage support -base64 = { version = "0.13", default-features = false, optional = true } +base64 = { version = "0.20", default-features = false, features = ["std"], optional = true } quick-xml = { version = "0.26.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 5b8cdb8..a2a98a3 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -18,11 +18,17 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; use crate::RetryConfig; +use base64::engine::fast_portable::FastPortable; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; use std::time::{Duration, Instant}; +const URL_SAFE_NO_PAD: FastPortable = FastPortable::from( + &base64::alphabet::URL_SAFE, + base64::engine::fast_portable::NO_PAD, +); + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("No RSA key found in pem file"))] @@ -166,7 +172,7 @@ impl OAuthProvider { ) .context(SignSnafu)?; - let signature = base64::encode_config(&sig_bytes, base64::URL_SAFE_NO_PAD); + let signature = base64::encode_engine(&sig_bytes, &URL_SAFE_NO_PAD); let jwt = [message, signature].join("."); let body = [ @@ -218,5 +224,5 @@ fn decode_first_rsa_key(private_key_pem: String) -> Result { fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; - 
Ok(base64::encode_config(string, base64::URL_SAFE_NO_PAD)) + Ok(base64::encode_engine(string, &URL_SAFE_NO_PAD)) } From ad91fc36593e2b3b6422b17efe79b4f651b1368b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 15 Dec 2022 21:29:43 +0000 Subject: [PATCH 068/397] Update AWS SDK (#3349) --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a662a81..8973254 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,8 +53,8 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } # AWS Profile support -aws-types = { version = "0.51", optional = true } -aws-config = { version = "0.51", optional = true } +aws-types = { version = "0.52", optional = true } +aws-config = { version = "0.52", optional = true } [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] From c147d7acd50298cd0dfe83b04a5bd4f19e21f2b9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 16 Dec 2022 13:54:24 +0000 Subject: [PATCH 069/397] Upstream newline_delimited_stream and ChunkedStore from DataFusion (#3341) * Upstream newline_delimited_stream and ChunkedStore from DataFusion * Clippy --- src/chunked.rs | 247 +++++++++++++++++++++++++++++++++++++++++++ src/delimited.rs | 270 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 3 + 3 files changed, 520 insertions(+) create mode 100644 src/chunked.rs create mode 100644 src/delimited.rs diff --git a/src/chunked.rs b/src/chunked.rs new file mode 100644 index 0000000..76865ef --- /dev/null +++ b/src/chunked.rs @@ -0,0 +1,247 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A [`ChunkedStore`] that can be used to test streaming behaviour + +use std::fmt::{Debug, Display, Formatter}; +use std::io::{BufReader, Read}; +use std::ops::Range; +use std::sync::Arc; + +use async_trait::async_trait; +use bytes::{BufMut, Bytes, BytesMut}; +use futures::stream::BoxStream; +use futures::StreamExt; +use tokio::io::AsyncWrite; + +use crate::path::Path; +use crate::util::maybe_spawn_blocking; +use crate::{GetResult, ListResult, ObjectMeta, ObjectStore}; +use crate::{MultipartId, Result}; + +/// Wraps a [`ObjectStore`] and makes its get response return chunks +/// in a controllable manner. +/// +/// A `ChunkedStore` makes the memory consumption and performance of +/// the wrapped [`ObjectStore`] worse. It is intended for use within +/// tests, to control the chunks in the produced output streams. 
For +/// example, it is used to verify the delimiting logic in +/// newline_delimited_stream. +#[derive(Debug)] +pub struct ChunkedStore { + inner: Arc, + chunk_size: usize, +} + +impl ChunkedStore { + /// Creates a new [`ChunkedStore`] with the specified chunk_size + pub fn new(inner: Arc, chunk_size: usize) -> Self { + Self { inner, chunk_size } + } +} + +impl Display for ChunkedStore { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "ChunkedStore({})", self.inner) + } +} + +#[async_trait] +impl ObjectStore for ChunkedStore { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.inner.put(location, bytes).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + self.inner.put_multipart(location).await + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + self.inner.abort_multipart(location, multipart_id).await + } + + async fn get(&self, location: &Path) -> Result { + match self.inner.get(location).await? { + GetResult::File(std_file, ..) => { + let reader = BufReader::new(std_file); + let chunk_size = self.chunk_size; + Ok(GetResult::Stream( + futures::stream::try_unfold(reader, move |mut reader| async move { + let (r, out, reader) = maybe_spawn_blocking(move || { + let mut out = Vec::with_capacity(chunk_size); + let r = (&mut reader) + .take(chunk_size as u64) + .read_to_end(&mut out) + .map_err(|err| crate::Error::Generic { + store: "ChunkedStore", + source: Box::new(err), + })?; + Ok((r, out, reader)) + }) + .await?; + + match r { + 0 => Ok(None), + _ => Ok(Some((out.into(), reader))), + } + }) + .boxed(), + )) + } + GetResult::Stream(stream) => { + let buffer = BytesMut::new(); + Ok(GetResult::Stream( + futures::stream::unfold( + (stream, buffer, false, self.chunk_size), + |(mut stream, mut buffer, mut exhausted, chunk_size)| async move { + // Keep accumulating bytes until we reach capacity as long as + // the stream can provide them: + if exhausted { + return None; + } + while buffer.len() < chunk_size { + match stream.next().await { + None => { + exhausted = true; + let slice = buffer.split_off(0).freeze(); + return Some(( + Ok(slice), + (stream, buffer, exhausted, chunk_size), + )); + } + Some(Ok(bytes)) => { + buffer.put(bytes); + } + Some(Err(e)) => { + return Some(( + Err(crate::Error::Generic { + store: "ChunkedStore", + source: Box::new(e), + }), + (stream, buffer, exhausted, chunk_size), + )) + } + }; + } + // Return the chunked values as the next value in the stream + let slice = buffer.split_to(chunk_size).freeze(); + Some((Ok(slice), (stream, buffer, exhausted, chunk_size))) + }, + ) + .boxed(), + )) + } + } + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + self.inner.get_range(location, range).await + } + + async fn head(&self, location: &Path) -> Result { + self.inner.head(location).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.inner.delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + self.inner.list(prefix).await + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + self.inner.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.inner.copy(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.inner.copy_if_not_exists(from, to).await + } +} + +#[cfg(test)] +mod tests { + use 
futures::StreamExt; + + use crate::local::LocalFileSystem; + use crate::memory::InMemory; + use crate::path::Path; + use crate::tests::*; + + use super::*; + + #[tokio::test] + async fn test_chunked_basic() { + let location = Path::parse("test").unwrap(); + let store: Arc = Arc::new(InMemory::new()); + store + .put(&location, Bytes::from(vec![0; 1001])) + .await + .unwrap(); + + for chunk_size in [10, 20, 31] { + let store = ChunkedStore::new(Arc::clone(&store), chunk_size); + let mut s = match store.get(&location).await.unwrap() { + GetResult::Stream(s) => s, + _ => unreachable!(), + }; + + let mut remaining = 1001; + while let Some(next) = s.next().await { + let size = next.unwrap().len(); + let expected = remaining.min(chunk_size); + assert_eq!(size, expected); + remaining -= expected; + } + assert_eq!(remaining, 0); + } + } + + #[tokio::test] + async fn test_chunked() { + let temporary = tempfile::tempdir().unwrap(); + let integrations: &[Arc] = &[ + Arc::new(InMemory::new()), + Arc::new(LocalFileSystem::new_with_prefix(temporary.path()).unwrap()), + ]; + + for integration in integrations { + let integration = ChunkedStore::new(Arc::clone(integration), 100); + + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + } +} diff --git a/src/delimited.rs b/src/delimited.rs new file mode 100644 index 0000000..1321486 --- /dev/null +++ b/src/delimited.rs @@ -0,0 +1,270 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Utility for streaming newline delimited files from object storage + +use std::collections::VecDeque; + +use bytes::Bytes; +use futures::{Stream, StreamExt}; +use snafu::{ensure, Snafu}; + +use super::Result; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("encountered unterminated string"))] + UnterminatedString, + + #[snafu(display("encountered trailing escape character"))] + TrailingEscape, +} + +impl From for super::Error { + fn from(err: Error) -> Self { + Self::Generic { + store: "LineDelimiter", + source: Box::new(err), + } + } +} + +/// The ASCII encoding of `"` +const QUOTE: u8 = b'"'; + +/// The ASCII encoding of `\n` +const NEWLINE: u8 = b'\n'; + +/// The ASCII encoding of `\` +const ESCAPE: u8 = b'\\'; + +/// [`LineDelimiter`] is provided with a stream of [`Bytes`] and returns an iterator +/// of [`Bytes`] containing a whole number of new line delimited records +#[derive(Debug, Default)] +struct LineDelimiter { + /// Complete chunks of [`Bytes`] + complete: VecDeque, + /// Remainder bytes that form the next record + remainder: Vec, + /// True if the last character was the escape character + is_escape: bool, + /// True if currently processing a quoted string + is_quote: bool, +} + +impl LineDelimiter { + /// Creates a new [`LineDelimiter`] with the provided delimiter + fn new() -> Self { + Self::default() + } + + /// Adds the next set of [`Bytes`] + fn push(&mut self, val: impl Into) { + let val: Bytes = val.into(); + + let is_escape = &mut self.is_escape; + let is_quote = &mut self.is_quote; + let mut record_ends = val.iter().enumerate().filter_map(|(idx, v)| { + if *is_escape { + *is_escape = false; + None + } else if *v == ESCAPE { + *is_escape = true; + None + } else if *v == QUOTE { + *is_quote = !*is_quote; + None + } else if *is_quote { + None + } else { + (*v == NEWLINE).then_some(idx + 1) + } + }); + + let start_offset = match self.remainder.is_empty() { + true => 0, + false => match record_ends.next() { + Some(idx) => { + self.remainder.extend_from_slice(&val[0..idx]); + self.complete + .push_back(Bytes::from(std::mem::take(&mut self.remainder))); + idx + } + None => { + self.remainder.extend_from_slice(&val); + return; + } + }, + }; + let end_offset = record_ends.last().unwrap_or(start_offset); + if start_offset != end_offset { + self.complete.push_back(val.slice(start_offset..end_offset)); + } + + if end_offset != val.len() { + self.remainder.extend_from_slice(&val[end_offset..]) + } + } + + /// Marks the end of the stream, delimiting any remaining bytes + /// + /// Returns `true` if there is no remaining data to be read + fn finish(&mut self) -> Result { + if !self.remainder.is_empty() { + ensure!(!self.is_quote, UnterminatedStringSnafu); + ensure!(!self.is_quote, TrailingEscapeSnafu); + + self.complete + .push_back(Bytes::from(std::mem::take(&mut self.remainder))) + } + Ok(self.complete.is_empty()) + } +} + +impl Iterator for LineDelimiter { + type Item = Bytes; + + fn next(&mut self) -> Option { + self.complete.pop_front() + } +} + +/// Given a [`Stream`] of [`Bytes`] returns a [`Stream`] where each +/// yielded [`Bytes`] contains a whole number of new line delimited records +/// accounting for `\` style escapes and `"` quotes +pub fn newline_delimited_stream(s: S) -> impl Stream> +where + S: Stream> + Unpin, +{ + let delimiter = LineDelimiter::new(); + + futures::stream::unfold( + (s, delimiter, false), + |(mut s, mut delimiter, mut exhausted)| async move { + loop { + if let Some(next) = delimiter.next() { + return Some((Ok(next), (s, delimiter, 
exhausted))); + } else if exhausted { + return None; + } + + match s.next().await { + Some(Ok(bytes)) => delimiter.push(bytes), + Some(Err(e)) => return Some((Err(e), (s, delimiter, exhausted))), + None => { + exhausted = true; + match delimiter.finish() { + Ok(true) => return None, + Ok(false) => continue, + Err(e) => return Some((Err(e), (s, delimiter, exhausted))), + } + } + } + } + }, + ) +} + +#[cfg(test)] +mod tests { + use futures::stream::{BoxStream, TryStreamExt}; + + use super::*; + + #[test] + fn test_delimiter() { + let mut delimiter = LineDelimiter::new(); + delimiter.push("hello\nworld"); + delimiter.push("\n\n"); + + assert_eq!(delimiter.next().unwrap(), Bytes::from("hello\n")); + assert_eq!(delimiter.next().unwrap(), Bytes::from("world\n")); + assert_eq!(delimiter.next().unwrap(), Bytes::from("\n")); + assert!(delimiter.next().is_none()); + } + + #[test] + fn test_delimiter_escaped() { + let mut delimiter = LineDelimiter::new(); + delimiter.push(""); + delimiter.push("fo\\\n\"foo"); + delimiter.push("bo\n\"bar\n"); + delimiter.push("\"he"); + delimiter.push("llo\"\n"); + assert_eq!( + delimiter.next().unwrap(), + Bytes::from("fo\\\n\"foobo\n\"bar\n") + ); + assert_eq!(delimiter.next().unwrap(), Bytes::from("\"hello\"\n")); + assert!(delimiter.next().is_none()); + + // Verify can push further data + delimiter.push("\"foo\nbar\",\"fiz\\\"inner\\\"\"\nhello"); + assert!(!delimiter.finish().unwrap()); + + assert_eq!( + delimiter.next().unwrap(), + Bytes::from("\"foo\nbar\",\"fiz\\\"inner\\\"\"\n") + ); + assert_eq!(delimiter.next().unwrap(), Bytes::from("hello")); + assert!(delimiter.finish().unwrap()); + assert!(delimiter.next().is_none()); + } + + #[tokio::test] + async fn test_delimiter_stream() { + let input = vec!["hello\nworld\nbin", "go\ncup", "cakes"]; + let input_stream = + futures::stream::iter(input.into_iter().map(|s| Ok(Bytes::from(s)))); + let stream = newline_delimited_stream(input_stream); + + let results: Vec<_> = stream.try_collect().await.unwrap(); + assert_eq!( + results, + vec![ + Bytes::from("hello\nworld\n"), + Bytes::from("bingo\n"), + Bytes::from("cupcakes") + ] + ) + } + #[tokio::test] + async fn test_delimiter_unfold_stream() { + let input_stream: BoxStream<'static, Result> = futures::stream::unfold( + VecDeque::from(["hello\nworld\nbin", "go\ncup", "cakes"]), + |mut input| async move { + if !input.is_empty() { + Some((Ok(Bytes::from(input.pop_front().unwrap())), input)) + } else { + None + } + }, + ) + .boxed(); + let stream = newline_delimited_stream(input_stream); + + let results: Vec<_> = stream.try_collect().await.unwrap(); + assert_eq!( + results, + vec![ + Bytes::from("hello\nworld\n"), + Bytes::from("bingo\n"), + Bytes::from("cupcakes") + ] + ) + } +} diff --git a/src/lib.rs b/src/lib.rs index 0cd5661..85e8737 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -163,6 +163,9 @@ compile_error!("Features 'gcp', 'aws', 'azure' are not supported on wasm."); pub mod aws; #[cfg(feature = "azure")] pub mod azure; +#[cfg(not(target_arch = "wasm32"))] +pub mod chunked; +pub mod delimited; #[cfg(feature = "gcp")] pub mod gcp; pub mod limit; From e643abef1f4f6708886b00c7f9ec922d3ba99ad3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 16 Dec 2022 19:27:12 +0000 Subject: [PATCH 070/397] More clippy lint fixes (#3355) --- src/aws/client.rs | 2 +- src/azure/client.rs | 2 +- src/azure/credential.rs | 2 +- src/local.rs | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/aws/client.rs 
b/src/aws/client.rs index ccc0a9c..d2d2aef 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -131,7 +131,7 @@ impl TryFrom for ListResult { let common_prefixes = value .common_prefixes .into_iter() - .map(|x| Ok(Path::parse(&x.prefix)?)) + .map(|x| Ok(Path::parse(x.prefix)?)) .collect::>()?; let objects = value diff --git a/src/azure/client.rs b/src/azure/client.rs index b537f5e..fedd85e 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -407,7 +407,7 @@ impl TryFrom for ListResult { .blob_prefix .unwrap_or_default() .into_iter() - .map(|x| Ok(Path::parse(&x.name)?)) + .map(|x| Ok(Path::parse(x.name)?)) .collect::>()?; let objects = value diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 721fcae..38e6e64 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -153,7 +153,7 @@ fn generate_authorization( key: &str, ) -> String { let str_to_sign = string_to_sign(h, u, method, account); - let auth = hmac_sha256(base64::decode(key).unwrap(), &str_to_sign); + let auth = hmac_sha256(base64::decode(key).unwrap(), str_to_sign); format!("SharedKey {}:{}", account, base64::encode(auth)) } diff --git a/src/local.rs b/src/local.rs index f7b7ad7..2ef87ad 100644 --- a/src/local.rs +++ b/src/local.rs @@ -396,7 +396,7 @@ impl ObjectStore for LocalFileSystem { None => self.config.root.to_file_path().unwrap(), }; - let walkdir = WalkDir::new(&root_path) + let walkdir = WalkDir::new(root_path) // Don't include the root directory itself .min_depth(1) .follow_links(true); @@ -748,7 +748,7 @@ impl AsyncWrite for LocalUpload { self.inner_state = LocalUploadState::Complete; file.sync_all()?; std::mem::drop(file); - std::fs::rename(&staging_path, &self.dest)?; + std::fs::rename(staging_path, &self.dest)?; Poll::Ready(Ok(())) } _ => { From b50cf4722f06b7a0860dcddb1b16d48b228c5843 Mon Sep 17 00:00:00 2001 From: Your friendly neighborhood geek Date: Mon, 19 Dec 2022 14:38:35 +0530 Subject: [PATCH 071/397] add support for content-type in `ClientOptions` (#3358) * add support for content-type in `ClientOptions` - currently only supported for aws & azure * add ClientOptions to GoogleCloudStorageClient - add methods `filename` and `extension` to `Path` --- src/aws/client.rs | 8 +++++++- src/azure/client.rs | 5 +++++ src/client/mod.rs | 39 +++++++++++++++++++++++++++++++++++++++ src/gcp/mod.rs | 16 ++++++++++++++-- src/path/mod.rs | 45 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 110 insertions(+), 3 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index d2d2aef..0e22bfc 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -29,7 +29,9 @@ use crate::{ use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::{utf8_percent_encode, PercentEncode}; -use reqwest::{Client as ReqwestClient, Method, Response, StatusCode}; +use reqwest::{ + header::CONTENT_TYPE, Client as ReqwestClient, Method, Response, StatusCode, +}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::ops::Range; @@ -279,6 +281,10 @@ impl S3Client { builder = builder.body(bytes) } + if let Some(value) = self.config().client_options.get_content_type(path) { + builder = builder.header(CONTENT_TYPE, value); + } + let response = builder .query(query) .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") diff --git a/src/azure/client.rs b/src/azure/client.rs index fedd85e..440c379 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -28,6 +28,7 @@ use crate::{ use bytes::{Buf, Bytes}; use chrono::{DateTime, TimeZone, 
Utc}; use itertools::Itertools; +use reqwest::header::CONTENT_TYPE; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, Client as ReqwestClient, Method, Response, StatusCode, @@ -207,6 +208,10 @@ impl AzureClient { builder = builder.query(query); } + if let Some(value) = self.config().client_options.get_content_type(path) { + builder = builder.header(CONTENT_TYPE, value); + } + if let Some(bytes) = bytes { builder = builder .header(CONTENT_LENGTH, HeaderValue::from(bytes.len())) diff --git a/src/client/mod.rs b/src/client/mod.rs index 47e6863..9df7b50 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -26,8 +26,11 @@ pub mod token; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy}; +use std::collections::HashMap; use std::time::Duration; +use crate::path::Path; + fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { store: "HTTP client", @@ -42,6 +45,8 @@ static DEFAULT_USER_AGENT: &str = #[derive(Debug, Clone, Default)] pub struct ClientOptions { user_agent: Option, + content_type_map: HashMap, + default_content_type: Option, default_headers: Option, proxy_url: Option, allow_http: bool, @@ -70,6 +75,22 @@ impl ClientOptions { self } + /// Set the default CONTENT_TYPE for uploads + pub fn with_default_content_type(mut self, mime: impl Into) -> Self { + self.default_content_type = Some(mime.into()); + self + } + + /// Set the CONTENT_TYPE for a given file extension + pub fn with_content_type_for_suffix( + mut self, + extension: impl Into, + mime: impl Into, + ) -> Self { + self.content_type_map.insert(extension.into(), mime.into()); + self + } + /// Sets the default headers for every request pub fn with_default_headers(mut self, headers: HeaderMap) -> Self { self.default_headers = Some(headers); @@ -165,6 +186,24 @@ impl ClientOptions { self } + /// Get the mime type for the file in `path` to be uploaded + /// + /// Gets the file extension from `path`, and returns the + /// mime type if it was defined initially through + /// `ClientOptions::with_content_type_for_suffix` + /// + /// Otherwise returns the default mime type if it was defined + /// earlier through `ClientOptions::with_default_content_type` + pub fn get_content_type(&self, path: &Path) -> Option<&str> { + match path.extension() { + Some(extension) => match self.content_type_map.get(extension) { + Some(ct) => Some(ct.as_str()), + None => self.default_content_type.as_deref(), + }, + None => self.default_content_type.as_deref(), + } + } + pub(crate) fn client(&self) -> super::Result { let mut builder = ClientBuilder::new(); diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index b3bd572..c83ab64 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -258,6 +258,7 @@ struct GoogleCloudStorageClient { bucket_name_encoded: String, retry_config: RetryConfig, + client_options: ClientOptions, // TODO: Hook this up in tests max_list_results: Option, @@ -328,10 +329,15 @@ impl GoogleCloudStorageClient { self.base_url, self.bucket_name_encoded ); + let content_type = self + .client_options + .get_content_type(path) + .unwrap_or("application/octet-stream"); + self.client .request(Method::POST, url) .bearer_auth(token) - .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) .query(&[("uploadType", "media"), ("name", path.as_ref())]) .body(payload) @@ -347,11 +353,16 @@ impl GoogleCloudStorageClient { let token = self.get_token().await?; let url = 
format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); + let content_type = self + .client_options + .get_content_type(path) + .unwrap_or("application/octet-stream"); + let response = self .client .request(Method::POST, &url) .bearer_auth(token) - .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, "0") .query(&[("uploads", "")]) .send_retry(&self.retry_config) @@ -967,6 +978,7 @@ impl GoogleCloudStorageBuilder { bucket_name, bucket_name_encoded: encoded_bucket_name, retry_config, + client_options, max_list_results: None, }), }) diff --git a/src/path/mod.rs b/src/path/mod.rs index 59ad471..020e5f5 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -229,6 +229,27 @@ impl Path { } } + /// Returns the last path segment containing the filename stored in this [`Path`] + pub fn filename(&self) -> Option<&str> { + match self.raw.is_empty() { + true => None, + false => self.raw.split(DELIMITER).last(), + } + } + + /// Returns the extension of the file stored in this [`Path`], if any + pub fn extension(&self) -> Option<&str> { + self.filename() + .and_then(|f| f.rsplit_once('.')) + .and_then(|(_, extension)| { + if extension.is_empty() { + None + } else { + Some(extension) + } + }) + } + /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix` /// /// Returns `None` if the prefix does not match @@ -551,4 +572,28 @@ mod tests { assert_eq!(a.raw, b.raw); assert_eq!(b.raw, c.raw); } + + #[test] + fn filename_from_path() { + let a = Path::from("foo/bar"); + let b = Path::from("foo/bar.baz"); + let c = Path::from("foo.bar/baz"); + + assert_eq!(a.filename(), Some("bar")); + assert_eq!(b.filename(), Some("bar.baz")); + assert_eq!(c.filename(), Some("baz")); + } + + #[test] + fn file_extension() { + let a = Path::from("foo/bar"); + let b = Path::from("foo/bar.baz"); + let c = Path::from("foo.bar/baz"); + let d = Path::from("foo.bar/baz.qux"); + + assert_eq!(a.extension(), None); + assert_eq!(b.extension(), Some("baz")); + assert_eq!(c.extension(), None); + assert_eq!(d.extension(), Some("qux")); + } } From 5210f7bc85102bba6f60faf415dd542e078f7362 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 19 Dec 2022 21:01:40 +0000 Subject: [PATCH 072/397] Add parquet ObjectStore integration (#3370) * Add parquet ObjectStore integration * Apply suggestions from code review Co-authored-by: Andrew Lamb * Add tests * Fix merge conflict Co-authored-by: Andrew Lamb --- src/lib.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 85e8737..6078c1c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,9 +33,18 @@ //! //! # Create an [`ObjectStore`] implementation: //! -//! * [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder) -//! * [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder) -//! 
* [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/):: [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder) +#![cfg_attr( + feature = "gcp", + doc = "* [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" +)] +#![cfg_attr( + feature = "aws", + doc = "* [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder)" +)] +#![cfg_attr( + feature = "azure", + doc = "* [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" +)] //! * In Memory: [`InMemory`](memory::InMemory) //! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! From 0646ab8defcdda75b63f9ead89b562cd32f69eaa Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 22 Dec 2022 09:13:44 +0000 Subject: [PATCH 073/397] Add HttpStore (#3294) (#3380) --- Cargo.toml | 1 + src/azure/client.rs | 18 +-- src/azure/mod.rs | 3 +- src/client/mod.rs | 2 + src/http/client.rs | 372 ++++++++++++++++++++++++++++++++++++++++++++ src/http/mod.rs | 281 +++++++++++++++++++++++++++++++++ src/lib.rs | 14 +- src/util.rs | 18 ++- 8 files changed, 689 insertions(+), 20 deletions(-) create mode 100644 src/http/client.rs create mode 100644 src/http/mod.rs diff --git a/Cargo.toml b/Cargo.toml index 8973254..fd033d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,7 @@ cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] +http = ["cloud"] # Experimental support for AWS_PROFILE aws_profile = ["aws", "aws-config", "aws-types"] diff --git a/src/azure/client.rs b/src/azure/client.rs index 440c379..50f8363 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -20,20 +20,20 @@ use crate::azure::credential::*; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::path::DELIMITER; -use crate::util::{format_http_range, format_prefix}; +use crate::util::{deserialize_rfc1123, format_http_range, format_prefix}; use crate::{ BoxStream, ClientOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, StreamExt, }; use bytes::{Buf, Bytes}; -use chrono::{DateTime, TimeZone, Utc}; +use chrono::{DateTime, Utc}; use itertools::Itertools; use reqwest::header::CONTENT_TYPE; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, Client as ReqwestClient, Method, Response, StatusCode, }; -use serde::{Deserialize, Deserializer, Serialize}; +use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::collections::HashMap; use std::ops::Range; @@ -479,7 +479,7 @@ impl TryFrom for ObjectMeta { #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] #[serde(rename_all = "PascalCase")] struct BlobProperties { - #[serde(deserialize_with = "deserialize_http_date", rename = "Last-Modified")] + #[serde(deserialize_with = "deserialize_rfc1123", rename = "Last-Modified")] pub last_modified: DateTime, pub etag: String, #[serde(rename = "Content-Length")] @@ -492,16 +492,6 @@ struct BlobProperties { pub content_language: Option, } -// deserialize dates used in Azure payloads according to rfc1123 -fn deserialize_http_date<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let s = String::deserialize(deserializer)?; - Utc.datetime_from_str(&s, RFC1123_FMT) - .map_err(serde::de::Error::custom) -} - #[derive(Debug, Clone, 
PartialEq, Eq)] pub(crate) struct BlockId(Bytes); diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 2cc4fe1..4224ae6 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -46,6 +46,7 @@ use std::sync::Arc; use tokio::io::AsyncWrite; use url::Url; +use crate::util::RFC1123_FMT; pub use credential::authority_hosts; mod client; @@ -219,7 +220,7 @@ impl ObjectStore for MicrosoftAzure { .to_str() .context(BadHeaderSnafu)?; let last_modified = Utc - .datetime_from_str(last_modified, credential::RFC1123_FMT) + .datetime_from_str(last_modified, RFC1123_FMT) .context(InvalidLastModifiedSnafu { last_modified })?; let content_length = headers diff --git a/src/client/mod.rs b/src/client/mod.rs index 9df7b50..f07377e 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -20,8 +20,10 @@ pub mod backoff; #[cfg(test)] pub mod mock_server; +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod pagination; pub mod retry; +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; use reqwest::header::{HeaderMap, HeaderValue}; diff --git a/src/http/client.rs b/src/http/client.rs new file mode 100644 index 0000000..799c5be --- /dev/null +++ b/src/http/client.rs @@ -0,0 +1,372 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
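The `deserialize_rfc1123` helper and `RFC1123_FMT` constant shared through `util` above parse HTTP-date style timestamps such as those in `Last-Modified` headers. As a standalone chrono sketch of the format involved (not code from the patch; the exact crate constant is assumed to match the usual HTTP-date pattern):

```rust
use chrono::{TimeZone, Utc};

fn main() {
    // HTTP-date / RFC 1123, as seen in `Last-Modified` headers.
    let fmt = "%a, %d %b %Y %H:%M:%S GMT";
    let parsed = Utc
        .datetime_from_str("Sun, 06 Nov 1994 08:49:37 GMT", fmt)
        .unwrap();
    assert_eq!(parsed.timestamp(), 784_111_777);
}
```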
+ +use crate::client::retry::{self, RetryConfig, RetryExt}; +use crate::path::{Path, DELIMITER}; +use crate::util::{deserialize_rfc1123, format_http_range}; +use crate::{ClientOptions, ObjectMeta, Result}; +use bytes::{Buf, Bytes}; +use chrono::{DateTime, Utc}; +use percent_encoding::percent_decode_str; +use reqwest::header::{CONTENT_TYPE, RANGE}; +use reqwest::{Method, Response, StatusCode}; +use serde::Deserialize; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::ops::Range; +use url::Url; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Request error: {}", source))] + Request { source: retry::Error }, + + #[snafu(display("Request error: {}", source))] + Reqwest { source: reqwest::Error }, + + #[snafu(display("Error decoding PROPFIND response: {}", source))] + InvalidPropFind { source: quick_xml::de::DeError }, + + #[snafu(display("Missing content size for {}", href))] + MissingSize { href: String }, + + #[snafu(display("Error getting properties of \"{}\" got \"{}\"", href, status))] + PropStatus { href: String, status: String }, + + #[snafu(display("Failed to parse href \"{}\": {}", href, source))] + InvalidHref { + href: String, + source: url::ParseError, + }, + + #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + NonUnicode { + path: String, + source: std::str::Utf8Error, + }, + + #[snafu(display("Encountered invalid path \"{}\": {}", path, source))] + InvalidPath { + path: String, + source: crate::path::Error, + }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + Self::Generic { + store: "HTTP", + source: Box::new(err), + } + } +} + +/// Internal client for HttpStore +#[derive(Debug)] +pub struct Client { + url: Url, + client: reqwest::Client, + retry_config: RetryConfig, + client_options: ClientOptions, +} + +impl Client { + pub fn new( + url: Url, + client_options: ClientOptions, + retry_config: RetryConfig, + ) -> Result { + let client = client_options.client()?; + Ok(Self { + url, + retry_config, + client_options, + client, + }) + } + + pub fn base_url(&self) -> &Url { + &self.url + } + + fn path_url(&self, location: &Path) -> Url { + let mut url = self.url.clone(); + url.path_segments_mut().unwrap().extend(location.parts()); + url + } + + /// Create a directory with `path` using MKCOL + async fn make_directory(&self, path: &str) -> Result<(), Error> { + let method = Method::from_bytes(b"MKCOL").unwrap(); + let mut url = self.url.clone(); + url.path_segments_mut() + .unwrap() + .extend(path.split(DELIMITER)); + + self.client + .request(method, url) + .send_retry(&self.retry_config) + .await + .context(RequestSnafu)?; + + Ok(()) + } + + /// Recursively create parent directories + async fn create_parent_directories(&self, location: &Path) -> Result<()> { + let mut stack = vec![]; + + // Walk backwards until a request succeeds + let mut last_prefix = location.as_ref(); + while let Some((prefix, _)) = last_prefix.rsplit_once(DELIMITER) { + last_prefix = prefix; + + match self.make_directory(prefix).await { + Ok(_) => break, + Err(Error::Request { source }) + if matches!(source.status(), Some(StatusCode::CONFLICT)) => + { + // Need to create parent + stack.push(prefix) + } + Err(e) => return Err(e.into()), + } + } + + // Retry the failed requests, which should now succeed + for prefix in stack.into_iter().rev() { + self.make_directory(prefix).await?; + } + + Ok(()) + } + + pub async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let mut retry = false; + loop { + let url = self.path_url(location); + 
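The parent-directory handling above walks back through the path one segment at a time, stops at the first `MKCOL` that succeeds, and then replays the failed prefixes shallowest-first. The prefix order that replay loop works through can be sketched with plain string handling (hypothetical helper, not part of the patch):

```rust
// Collect every parent prefix of a location, shallowest first - the order in
// which the replay loop re-issues MKCOL for prefixes that failed during the
// walk back.
fn parent_prefixes(location: &str) -> Vec<&str> {
    let mut stack = Vec::new();
    let mut last = location;
    while let Some((prefix, _)) = last.rsplit_once('/') {
        stack.push(prefix);
        last = prefix;
    }
    stack.reverse();
    stack
}

fn main() {
    assert_eq!(
        parent_prefixes("a/b/c/file.txt"),
        vec!["a", "a/b", "a/b/c"]
    );
}
```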
let mut builder = self.client.put(url).body(bytes.clone()); + if let Some(value) = self.client_options.get_content_type(location) { + builder = builder.header(CONTENT_TYPE, value); + } + + match builder.send_retry(&self.retry_config).await { + Ok(_) => return Ok(()), + Err(source) => match source.status() { + // Some implementations return 404 instead of 409 + Some(StatusCode::CONFLICT | StatusCode::NOT_FOUND) if !retry => { + retry = true; + self.create_parent_directories(location).await? + } + _ => return Err(Error::Request { source }.into()), + }, + } + } + } + + pub async fn list( + &self, + location: Option<&Path>, + depth: &str, + ) -> Result { + let url = location + .map(|path| self.path_url(path)) + .unwrap_or_else(|| self.url.clone()); + + let method = Method::from_bytes(b"PROPFIND").unwrap(); + let result = self + .client + .request(method, url) + .header("Depth", depth) + .send_retry(&self.retry_config) + .await; + + let response = match result { + Ok(result) => result.bytes().await.context(ReqwestSnafu)?, + Err(e) if matches!(e.status(), Some(StatusCode::NOT_FOUND)) => { + return match depth { + "0" => { + let path = location.map(|x| x.as_ref()).unwrap_or(""); + Err(crate::Error::NotFound { + path: path.to_string(), + source: Box::new(e), + }) + } + _ => { + // If prefix not found, return empty result set + Ok(Default::default()) + } + }; + } + Err(source) => return Err(Error::Request { source }.into()), + }; + + let status = quick_xml::de::from_reader(response.reader()) + .context(InvalidPropFindSnafu)?; + Ok(status) + } + + pub async fn delete(&self, path: &Path) -> Result<()> { + let url = self.path_url(path); + self.client + .delete(url) + .send_retry(&self.retry_config) + .await + .context(RequestSnafu)?; + Ok(()) + } + + pub async fn get( + &self, + location: &Path, + range: Option>, + ) -> Result { + let url = self.path_url(location); + let mut builder = self.client.get(url); + + if let Some(range) = range { + builder = builder.header(RANGE, format_http_range(range)); + } + + builder + .send_retry(&self.retry_config) + .await + .map_err(|source| match source.status() { + Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { + source: Box::new(source), + path: location.to_string(), + }, + _ => Error::Request { source }.into(), + }) + } + + pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { + let from = self.path_url(from); + let to = self.path_url(to); + let method = Method::from_bytes(b"COPY").unwrap(); + + let mut builder = self + .client + .request(method, from) + .header("Destination", to.as_str()); + + if !overwrite { + builder = builder.header("Overwrite", "F"); + } + + match builder.send_retry(&self.retry_config).await { + Ok(_) => Ok(()), + Err(e) + if !overwrite + && matches!(e.status(), Some(StatusCode::PRECONDITION_FAILED)) => + { + Err(crate::Error::AlreadyExists { + path: to.to_string(), + source: Box::new(e), + }) + } + Err(source) => Err(Error::Request { source }.into()), + } + } +} + +/// The response returned by a PROPFIND request, i.e. 
list +#[derive(Deserialize, Default)] +pub struct MultiStatus { + pub response: Vec, +} + +#[derive(Deserialize)] +pub struct MultiStatusResponse { + href: String, + #[serde(rename = "propstat")] + prop_stat: PropStat, +} + +impl MultiStatusResponse { + /// Returns an error if this response is not OK + pub fn check_ok(&self) -> Result<()> { + match self.prop_stat.status.contains("200 OK") { + true => Ok(()), + false => Err(Error::PropStatus { + href: self.href.clone(), + status: self.prop_stat.status.clone(), + } + .into()), + } + } + + /// Returns the resolved path of this element relative to `base_url` + pub fn path(&self, base_url: &Url) -> Result { + let url = Url::options() + .base_url(Some(base_url)) + .parse(&self.href) + .context(InvalidHrefSnafu { href: &self.href })?; + + // Reverse any percent encoding + let path = percent_decode_str(url.path()) + .decode_utf8() + .context(NonUnicodeSnafu { path: url.path() })?; + + Ok(Path::parse(path.as_ref()).context(InvalidPathSnafu { path })?) + } + + fn size(&self) -> Result { + let size = self + .prop_stat + .prop + .content_length + .context(MissingSizeSnafu { href: &self.href })?; + Ok(size) + } + + /// Returns this objects metadata as [`ObjectMeta`] + pub fn object_meta(&self, base_url: &Url) -> Result { + Ok(ObjectMeta { + location: self.path(base_url)?, + last_modified: self.prop_stat.prop.last_modified, + size: self.size()?, + }) + } + + /// Returns true if this is a directory / collection + pub fn is_dir(&self) -> bool { + self.prop_stat.prop.resource_type.collection.is_some() + } +} + +#[derive(Deserialize)] +pub struct PropStat { + prop: Prop, + status: String, +} + +#[derive(Deserialize)] +pub struct Prop { + #[serde(deserialize_with = "deserialize_rfc1123", rename = "getlastmodified")] + last_modified: DateTime, + + #[serde(rename = "getcontentlength")] + content_length: Option, + + #[serde(rename = "resourcetype")] + resource_type: ResourceType, +} + +#[derive(Deserialize)] +pub struct ResourceType { + collection: Option<()>, +} diff --git a/src/http/mod.rs b/src/http/mod.rs new file mode 100644 index 0000000..25997d8 --- /dev/null +++ b/src/http/mod.rs @@ -0,0 +1,281 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store implementation for generic HTTP servers +//! +//! This follows [rfc2518] commonly known called [WebDAV] +//! +//! Basic get support will work out of the box with most HTTP servers, +//! even those that don't explicitly support [rfc2518] +//! +//! Other operations such as list, delete, copy, etc... will likely +//! require server-side configuration. A list of HTTP servers with support +//! can be found [here](https://wiki.archlinux.org/title/WebDAV#Server) +//! +//! Multipart uploads are not currently supported +//! +//! 
[rfc2518]: https://datatracker.ietf.org/doc/html/rfc2518 +//! [WebDAV]: https://en.wikipedia.org/wiki/WebDAV + +use std::ops::Range; + +use async_trait::async_trait; +use bytes::Bytes; +use futures::stream::BoxStream; +use futures::{StreamExt, TryStreamExt}; +use snafu::{OptionExt, ResultExt, Snafu}; +use tokio::io::AsyncWrite; +use url::Url; + +use crate::http::client::Client; +use crate::path::Path; +use crate::{ + ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + RetryConfig, +}; + +mod client; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Must specify a URL"))] + MissingUrl, + + #[snafu(display("Invalid URL: {}", source))] + InvalidUrl { source: reqwest::Error }, + + #[snafu(display("Object is a directory"))] + IsDirectory, + + #[snafu(display("PROPFIND response contained no valid objects"))] + NoObjects, + + #[snafu(display("PROPFIND response contained more than one object"))] + MultipleObjects, + + #[snafu(display("Request error: {}", source))] + Reqwest { source: reqwest::Error }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + Self::Generic { + store: "HTTP", + source: Box::new(err), + } + } +} + +/// An [`ObjectStore`] implementation for generic HTTP servers +/// +/// See [`crate::http`] for more information +#[derive(Debug)] +pub struct HttpStore { + client: Client, +} + +impl std::fmt::Display for HttpStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "HttpStore") + } +} + +#[async_trait] +impl ObjectStore for HttpStore { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.client.put(location, bytes).await + } + + async fn put_multipart( + &self, + _location: &Path, + ) -> Result<(MultipartId, Box)> { + Err(super::Error::NotImplemented) + } + + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> Result<()> { + Err(super::Error::NotImplemented) + } + + async fn get(&self, location: &Path) -> Result { + let response = self.client.get(location, None).await?; + let stream = response + .bytes_stream() + .map_err(|source| Error::Reqwest { source }.into()) + .boxed(); + + Ok(GetResult::Stream(stream)) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let bytes = self + .client + .get(location, Some(range)) + .await? 
+ .bytes() + .await + .context(ReqwestSnafu)?; + Ok(bytes) + } + + async fn head(&self, location: &Path) -> Result { + let status = self.client.list(Some(location), "0").await?; + match status.response.len() { + 1 => { + let response = status.response.into_iter().next().unwrap(); + response.check_ok()?; + match response.is_dir() { + true => Err(Error::IsDirectory.into()), + false => response.object_meta(self.client.base_url()), + } + } + 0 => Err(Error::NoObjects.into()), + _ => Err(Error::MultipleObjects.into()), + } + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.client.delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let status = self.client.list(prefix, "infinity").await?; + Ok(futures::stream::iter( + status + .response + .into_iter() + .filter(|r| !r.is_dir()) + .map(|response| { + response.check_ok()?; + response.object_meta(self.client.base_url()) + }), + ) + .boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let status = self.client.list(prefix, "1").await?; + let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or(0); + + let mut objects: Vec = Vec::with_capacity(status.response.len()); + let mut common_prefixes = Vec::with_capacity(status.response.len()); + for response in status.response { + response.check_ok()?; + match response.is_dir() { + false => objects.push(response.object_meta(self.client.base_url())?), + true => { + let path = response.path(self.client.base_url())?; + // Exclude the current object + if path.as_ref().len() > prefix_len { + common_prefixes.push(path); + } + } + } + } + + Ok(ListResult { + common_prefixes, + objects, + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy(from, to, true).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy(from, to, false).await + } +} + +/// Configure a connection to a generic HTTP server +#[derive(Debug, Default)] +pub struct HttpBuilder { + url: Option>, + client_options: ClientOptions, + retry_config: RetryConfig, +} + +impl HttpBuilder { + /// Create a new [`HttpBuilder`] with default values. 
+ pub fn new() -> Self { + Default::default() + } + + /// Set the URL + pub fn with_url(mut self, url: impl reqwest::IntoUrl) -> Self { + self.url = Some(url.into_url().context(InvalidUrlSnafu).map_err(Into::into)); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; + self + } + + /// Build an [`HttpStore`] with the configured options + pub fn build(self) -> Result { + let url = self.url.context(MissingUrlSnafu)??; + Ok(HttpStore { + client: Client::new(url, self.client_options, self.retry_config)?, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::tests::*; + + use super::*; + + #[tokio::test] + async fn http_test() { + dotenv::dotenv().ok(); + let force = std::env::var("TEST_INTEGRATION"); + if force.is_err() { + eprintln!("skipping HTTP integration test - set TEST_INTEGRATION to run"); + return; + } + let url = std::env::var("HTTP_URL").expect("HTTP_URL must be set"); + let options = ClientOptions::new().with_allow_http(true); + let integration = HttpBuilder::new() + .with_url(url) + .with_client_options(options) + .build() + .unwrap(); + + put_get_delete_list_opts(&integration, false).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + } +} diff --git a/src/lib.rs b/src/lib.rs index 6078c1c..0c416c2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,6 +45,10 @@ feature = "azure", doc = "* [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" )] +#![cfg_attr( + feature = "http", + doc = "* [HTTP Storage](https://datatracker.ietf.org/doc/html/rfc2518): [`HttpBuilder`](http::HttpBuilder)" +)] //! * In Memory: [`InMemory`](memory::InMemory) //! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! @@ -177,6 +181,8 @@ pub mod chunked; pub mod delimited; #[cfg(feature = "gcp")] pub mod gcp; +#[cfg(feature = "http")] +pub mod http; pub mod limit; #[cfg(not(target_arch = "wasm32"))] pub mod local; @@ -185,10 +191,10 @@ pub mod path; pub mod prefix; pub mod throttle; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure"))] +#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] mod client; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure"))] +#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] pub use client::{backoff::BackoffConfig, retry::RetryConfig}; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] @@ -210,7 +216,7 @@ use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use tokio::io::AsyncWrite; -#[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] +#[cfg(any(feature = "azure", feature = "aws", feature = "gcp", feature = "http"))] pub use client::ClientOptions; /// An alias for a dynamically dispatched object store implementation. 
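Taken together, the `HttpBuilder` and the `ObjectStore` implementation above give the usual object-store workflow over WebDAV. A minimal sketch of how they might be wired up, assuming the `http` feature is enabled and a WebDAV-capable server is reachable at the placeholder URL `http://localhost:8080`:

```rust
use std::sync::Arc;

use bytes::Bytes;
use object_store::http::HttpBuilder;
use object_store::path::Path;
use object_store::{ClientOptions, ObjectStore};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // "http://localhost:8080" is a placeholder endpoint; allow_http is needed
    // because the URL is not HTTPS.
    let store: Arc<dyn ObjectStore> = Arc::new(
        HttpBuilder::new()
            .with_url("http://localhost:8080")
            .with_client_options(ClientOptions::new().with_allow_http(true))
            .build()?,
    );

    // put() uploads the object, creating missing parent collections via MKCOL
    // if the server responds with 409/404 on the first attempt.
    let path = Path::from("data/file.txt");
    store.put(&path, Bytes::from_static(b"hello world")).await?;

    // head() is implemented as a Depth: 0 PROPFIND against the same path.
    let meta = store.head(&path).await?;
    println!("{} is {} bytes", meta.location, meta.size);

    Ok(())
}
```

Plain GETs should work against most HTTP servers, but `put`, `delete`, `copy`, and the PROPFIND-backed listing calls need a server that actually implements RFC 2518.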
@@ -1003,7 +1009,7 @@ mod tests { let paths = flatten_list_stream(storage, None).await.unwrap(); for f in &paths { - let _ = storage.delete(f).await; + storage.delete(f).await.unwrap(); } } diff --git a/src/util.rs b/src/util.rs index 41c72d0..e592e7b 100644 --- a/src/util.rs +++ b/src/util.rs @@ -20,6 +20,22 @@ use super::Result; use bytes::Bytes; use futures::{stream::StreamExt, Stream, TryStreamExt}; +#[cfg(any(feature = "azure", feature = "http"))] +pub static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; + +// deserialize dates according to rfc1123 +#[cfg(any(feature = "azure", feature = "http"))] +pub fn deserialize_rfc1123<'de, D>( + deserializer: D, +) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + let s: String = serde::Deserialize::deserialize(deserializer)?; + chrono::TimeZone::datetime_from_str(&chrono::Utc, &s, RFC1123_FMT) + .map_err(serde::de::Error::custom) +} + /// Returns the prefix to be passed to an object store #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { @@ -30,7 +46,7 @@ pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { /// Returns a formatted HTTP range header as per /// -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] pub fn format_http_range(range: std::ops::Range) -> String { format!("bytes={}-{}", range.start, range.end.saturating_sub(1)) } From 914679c96e883ff2cb5b14f75865bb226db1f026 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 27 Dec 2022 10:22:56 +0000 Subject: [PATCH 074/397] Update quick-xml to 0.27 (#3395) * Update quick-xml * Fix Azure --- Cargo.toml | 2 +- src/aws/client.rs | 4 ++-- src/azure/client.rs | 4 ++-- src/gcp/mod.rs | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fd033d5..a9cc151 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.20", default-features = false, features = ["std"], optional = true } -quick-xml = { version = "0.26.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.27.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/src/aws/client.rs b/src/aws/client.rs index 0e22bfc..b40bcba 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -189,9 +189,9 @@ struct CompleteMultipart { #[derive(Debug, Serialize)] struct MultipartPart { - #[serde(rename = "$unflatten=ETag")] + #[serde(rename = "ETag")] e_tag: String, - #[serde(rename = "$unflatten=PartNumber")] + #[serde(rename = "PartNumber")] part_number: usize, } diff --git a/src/azure/client.rs b/src/azure/client.rs index 50f8363..556a2ad 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -410,7 +410,6 @@ impl TryFrom for ListResult { let common_prefixes = value .blobs .blob_prefix - .unwrap_or_default() .into_iter() .map(|x| Ok(Path::parse(x.name)?)) .collect::>()?; @@ -437,7 +436,8 @@ impl TryFrom for ListResult { #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] #[serde(rename_all = "PascalCase")] struct Blobs { - pub blob_prefix: Option>, + #[serde(default)] + pub 
blob_prefix: Vec, #[serde(rename = "Blob", default)] pub blobs: Vec, } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index c83ab64..c1424d9 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -221,9 +221,9 @@ struct InitiateMultipartUploadResult { #[derive(serde::Serialize, Debug)] #[serde(rename_all = "PascalCase", rename(serialize = "Part"))] struct MultipartPart { - #[serde(rename = "$unflatten=PartNumber")] + #[serde(rename = "PartNumber")] part_number: usize, - #[serde(rename = "$unflatten=ETag")] + #[serde(rename = "ETag")] e_tag: String, } From 00e2d3994bef8a4889458c37a5498b17ddcbc0f7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 27 Dec 2022 17:13:08 -0800 Subject: [PATCH 075/397] Make sure integration works on latest version of localstack (#3403) --- CONTRIBUTING.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index efcd5fe..550640d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,7 +39,7 @@ To test the S3 integration against [localstack](https://localstack.cloud/) First start up a container running localstack ``` -$ podman run --rm -it -p 4566:4566 -p 4510-4559:4510-4559 localstack/localstack +$ podman run --rm -it -e PROVIDER_OVERRIDE_S3=asf -p 4566:4566 -p 4510-4559:4510-4559 localstack/localstack ``` Setup environment @@ -49,7 +49,9 @@ export TEST_INTEGRATION=1 export OBJECT_STORE_AWS_DEFAULT_REGION=us-east-1 export OBJECT_STORE_AWS_ACCESS_KEY_ID=test export OBJECT_STORE_AWS_SECRET_ACCESS_KEY=test -export AWS_ENDPOINT=http://128.0.0.1:4566 +export OBJECT_STORE_AWS_ENDPOINT=http://localhost:4566 +export AWS_ACCESS_KEY_ID=test +export AWS_SECRET_ACCESS_KEY=test export OBJECT_STORE_BUCKET=test-bucket ``` @@ -59,6 +61,12 @@ Create a bucket using the AWS CLI podman run --net=host --env-host amazon/aws-cli --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket ``` +Or directly with: + +``` +aws s3 mb s3://test-bucket --endpoint-url=http://localhost:4566 +``` + Run tests ``` From fbf056176699b165c3f7a72d66413a5a99ae85af Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Wed, 28 Dec 2022 04:55:25 -0500 Subject: [PATCH 076/397] object_store: Flush buffered multipart only during poll_shutdown (#3397) Co-authored-by: askoa --- src/multipart.rs | 56 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/src/multipart.rs b/src/multipart.rs index de85914..65427d1 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -109,6 +109,43 @@ where } } +impl CloudMultiPartUpload +where + T: CloudMultiPartUploadImpl + Send + Sync, +{ + // The `poll_flush` function will only flush the in-progress tasks. + // The `final_flush` method called during `poll_shutdown` will flush + // the `current_buffer` along with in-progress tasks. + // Please see https://github.com/apache/arrow-rs/issues/3390 for more details. 
+ fn final_flush( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + // Poll current tasks + self.as_mut().poll_tasks(cx)?; + + // If current_buffer is not empty, see if it can be submitted + if !self.current_buffer.is_empty() && self.tasks.len() < self.max_concurrency { + let out_buffer: Vec = std::mem::take(&mut self.current_buffer); + let inner = Arc::clone(&self.inner); + let part_idx = self.current_part_idx; + self.tasks.push(Box::pin(async move { + let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; + Ok((part_idx, upload_part)) + })); + } + + self.as_mut().poll_tasks(cx)?; + + // If tasks and current_buffer are empty, return Ready + if self.tasks.is_empty() && self.current_buffer.is_empty() { + Poll::Ready(Ok(())) + } else { + Poll::Pending + } + } +} + impl AsyncWrite for CloudMultiPartUpload where T: CloudMultiPartUploadImpl + Send + Sync, @@ -158,21 +195,8 @@ where // Poll current tasks self.as_mut().poll_tasks(cx)?; - // If current_buffer is not empty, see if it can be submitted - if !self.current_buffer.is_empty() && self.tasks.len() < self.max_concurrency { - let out_buffer: Vec = std::mem::take(&mut self.current_buffer); - let inner = Arc::clone(&self.inner); - let part_idx = self.current_part_idx; - self.tasks.push(Box::pin(async move { - let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; - Ok((part_idx, upload_part)) - })); - } - - self.as_mut().poll_tasks(cx)?; - - // If tasks and current_buffer are empty, return Ready - if self.tasks.is_empty() && self.current_buffer.is_empty() { + // If tasks is empty, return Ready + if self.tasks.is_empty() { Poll::Ready(Ok(())) } else { Poll::Pending @@ -184,7 +208,7 @@ where cx: &mut std::task::Context<'_>, ) -> Poll> { // First, poll flush - match self.as_mut().poll_flush(cx) { + match self.as_mut().final_flush(cx) { Poll::Pending => return Poll::Pending, Poll::Ready(res) => res?, }; From 1c4242c093bf6ebd52a3ac32c91724ce59a86a26 Mon Sep 17 00:00:00 2001 From: GeauxEric Date: Sat, 31 Dec 2022 05:06:03 -0800 Subject: [PATCH 077/397] Add Put and Multipart Put doc examples (#3420) --- src/lib.rs | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0c416c2..425c5cd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -61,7 +61,7 @@ //! * Concurrent Request Limit: [`LimitStore`](limit::LimitStore) //! //! -//! # Listing objects: +//! # List objects: //! //! Use the [`ObjectStore::list`] method to iterate over objects in //! remote storage or files in the local filesystem: @@ -114,7 +114,7 @@ //! ... //! ``` //! -//! # Fetching objects +//! # Fetch objects //! //! Use the [`ObjectStore::get`] method to fetch the data bytes //! from remote storage or files in the local filesystem as a stream. @@ -164,7 +164,57 @@ //! ```text //! Num zeros in data/file01.parquet is 657 //! ``` +//! # Put object +//! Use the [`ObjectStore::put`] method to save data in remote storage or local filesystem. //! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # fn get_object_store() -> LocalFileSystem { +//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # } +//! # async fn put() { +//! use object_store::ObjectStore; +//! use std::sync::Arc; +//! use bytes::Bytes; +//! use object_store::path::Path; +//! +//! let object_store: Arc = Arc::new(get_object_store()); +//! let path: Path = "data/file1".try_into().unwrap(); +//! let bytes = Bytes::from_static(b"hello"); +//! 
object_store +//! .put(&path, bytes) +//! .await +//! .unwrap(); +//! # } +//! ``` +//! +//! # Multipart put object +//! Use the [`ObjectStore::put_multipart`] method to save large amount of data in chunks. +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # fn get_object_store() -> LocalFileSystem { +//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # } +//! # async fn multi_upload() { +//! use object_store::ObjectStore; +//! use std::sync::Arc; +//! use bytes::Bytes; +//! use tokio::io::AsyncWriteExt; +//! use object_store::path::Path; +//! +//! let object_store: Arc = Arc::new(get_object_store()); +//! let path: Path = "data/large_file".try_into().unwrap(); +//! let (_id, mut writer) = object_store +//! .put_multipart(&path) +//! .await +//! .unwrap(); +//! let bytes = Bytes::from_static(b"hello"); +//! writer.write_all(&bytes).await.unwrap(); +//! writer.flush().await.unwrap(); +//! writer.shutdown().await.unwrap(); +//! # } +//! ``` #[cfg(all( target_arch = "wasm32", From 8021c76128b1fb83fc0f703f8032654ab88e5f98 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 2 Jan 2023 14:42:19 +0000 Subject: [PATCH 078/397] Derive Clone for ObjectStore builders and Make URL Parsing Stricter (#3419) (#3424) * Derive Clone for ObjectStore builders (#3419) Make URL parsing more strict * Review feedback --- src/aws/mod.rs | 112 +++++++++++++++------------ src/azure/mod.rs | 196 ++++++++++++++++++++++++----------------------- src/gcp/mod.rs | 95 ++++++++++++----------- src/http/mod.rs | 21 +++-- 4 files changed, 228 insertions(+), 196 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 0fcfbaf..786ccd2 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -36,6 +36,7 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; +use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; @@ -129,6 +130,9 @@ enum Error { scheme ))] UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, } impl From for super::Error { @@ -358,7 +362,7 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload { /// .with_secret_access_key(SECRET_KEY) /// .build(); /// ``` -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct AmazonS3Builder { access_key_id: Option, secret_access_key: Option, @@ -366,13 +370,13 @@ pub struct AmazonS3Builder { bucket_name: Option, endpoint: Option, token: Option, + url: Option, retry_config: RetryConfig, imdsv1_fallback: bool, virtual_hosted_style_request: bool, metadata_endpoint: Option, profile: Option, client_options: ClientOptions, - url_parse_error: Option, } impl AmazonS3Builder { @@ -453,9 +457,7 @@ impl AmazonS3Builder { /// - `https://s3..amazonaws.com` /// - `https://.s3..amazonaws.com` /// - /// Please note that this is a best effort implementation, and will not fail for malformed URLs, - /// but rather warn and ignore the passed url. The url also has no effect on how the - /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// Note: Settings derived from the URL will override any others set on this builder /// /// # Example /// ``` @@ -465,44 +467,39 @@ impl AmazonS3Builder { /// .with_url("s3://bucket/path") /// .build(); /// ``` - pub fn with_url(mut self, url: impl AsRef) -> Self { - let maybe_parsed = Url::parse(url.as_ref()); - match maybe_parsed { - Ok(parsed) => match parsed.scheme() { - "s3" | "s3a" => { - self.bucket_name = parsed.host_str().map(|host| host.to_owned()); - } - "https" => { - if let Some(host) = parsed.host_str() { - let parts = host.splitn(4, '.').collect::>(); - if parts.len() == 4 && parts[0] == "s3" && parts[2] == "amazonaws" - { - self.bucket_name = Some(parts[1].to_string()); - } - if parts.len() == 4 - && parts[1] == "s3" - && parts[3] == "amazonaws.com" - { - self.bucket_name = Some(parts[0].to_string()); - self.region = Some(parts[2].to_string()); - self.virtual_hosted_style_request = true; - } - } + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "s3" | "s3a" => self.bucket_name = Some(validate(host)?), + "https" => match host.splitn(4, '.').collect_tuple() { + Some(("s3", bucket, "amazonaws", "com")) => { + self.bucket_name = Some(bucket.to_string()); } - other => { - self.url_parse_error = Some(Error::UnknownUrlScheme { - scheme: other.into(), - }); + Some((bucket, "s3", region, "amazonaws.com")) => { + self.bucket_name = Some(bucket.to_string()); + self.region = Some(region.to_string()); + self.virtual_hosted_style_request = true; } + _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), }, - Err(err) => { - self.url_parse_error = Some(Error::UnableToParseUrl { - source: err, - url: url.as_ref().into(), - }); - } + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), }; - self + Ok(()) } /// Set the AWS Access Key (required) @@ -641,9 +638,9 @@ impl AmazonS3Builder { /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. 
- pub fn build(self) -> Result { - if let Some(err) = self.url_parse_error { - return Err(err.into()); + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; } let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; @@ -1022,15 +1019,36 @@ mod tests { #[test] fn s3_test_urls() { - let builder = AmazonS3Builder::new().with_url("s3://bucket/path"); + let mut builder = AmazonS3Builder::new(); + builder.parse_url("s3://bucket/path").unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); - let builder = AmazonS3Builder::new().with_url("https://s3.bucket.amazonaws.com"); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.bucket.amazonaws.com") + .unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); - let builder = - AmazonS3Builder::new().with_url("https://bucket.s3.region.amazonaws.com"); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://bucket.s3.region.amazonaws.com") + .unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); - assert_eq!(builder.region, Some("region".to_string())) + assert_eq!(builder.region, Some("region".to_string())); + assert!(builder.virtual_hosted_style_request); + + let err_cases = [ + "mailto://bucket/path", + "s3://bucket.mydomain/path", + "https://s3.bucket.mydomain.com", + "https://s3.bucket.foo.amazonaws.com", + "https://bucket.mydomain.region.amazonaws.com", + "https://bucket.s3.region.bar.amazonaws.com", + "https://bucket.foo.s3.amazonaws.com", + ]; + let mut builder = AmazonS3Builder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } } } diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 4224ae6..7cf369d 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -37,7 +37,7 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{TimeZone, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::fmt::{Debug, Formatter}; use std::io; @@ -121,6 +121,9 @@ enum Error { scheme ))] UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, } impl From for super::Error { @@ -354,7 +357,7 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { /// .with_container_name(BUCKET_NAME) /// .build(); /// ``` -#[derive(Default)] +#[derive(Default, Clone)] pub struct MicrosoftAzureBuilder { account_name: Option, access_key: Option, @@ -365,10 +368,10 @@ pub struct MicrosoftAzureBuilder { tenant_id: Option, sas_query_pairs: Option>, authority_host: Option, + url: Option, use_emulator: bool, retry_config: RetryConfig, client_options: ClientOptions, - url_parse_error: Option, } impl Debug for MicrosoftAzureBuilder { @@ -444,9 +447,7 @@ impl MicrosoftAzureBuilder { /// - `https://.dfs.core.windows.net` /// - `https://.blob.core.windows.net` /// - /// Please note that this is a best effort implementation, and will not fail for malformed URLs, - /// but rather warn and ignore the passed url. The url also has no effect on how the - /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// Note: Settings derived from the URL will override any others set on this builder /// /// # Example /// ``` @@ -456,52 +457,48 @@ impl MicrosoftAzureBuilder { /// .with_url("abfss://file_system@account.dfs.core.windows.net/") /// .build(); /// ``` - pub fn with_url(mut self, url: impl AsRef) -> Self { - let maybe_parsed = Url::parse(url.as_ref()); - match maybe_parsed { - Ok(parsed) => match parsed.scheme() { - "az" | "adl" | "azure" => { - self.container_name = parsed.host_str().map(|host| host.to_owned()); - } - "abfs" | "abfss" => { - // abfs(s) might refer to the fsspec convention abfs:/// - // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ - if parsed.username().is_empty() { - self.container_name = - parsed.host_str().map(|host| host.to_owned()); - } else if let Some(host) = parsed.host_str() { - let parts = host.splitn(2, '.').collect::>(); - if parts.len() == 2 && parts[1] == "dfs.core.windows.net" { - self.container_name = Some(parsed.username().to_owned()); - self.account_name = Some(parts[0].to_string()); - } - } - } - "https" => { - if let Some(host) = parsed.host_str() { - let parts = host.splitn(2, '.').collect::>(); - if parts.len() == 2 - && (parts[1] == "dfs.core.windows.net" - || parts[1] == "blob.core.windows.net") - { - self.account_name = Some(parts[0].to_string()); - } - } + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "az" | "adl" | "azure" => self.container_name = Some(validate(host)?), + "abfs" | "abfss" => { + // abfs(s) might refer to the fsspec convention abfs:/// + // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ + if parsed.username().is_empty() { + self.container_name = Some(validate(host)?); + } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + } else { + return Err(UrlNotRecognisedSnafu { url }.build().into()); } - other => { - self.url_parse_error = Some(Error::UnknownUrlScheme { - scheme: other.into(), - }); + } + "https" => match host.split_once('.') { + Some((a, "dfs.core.windows.net")) + | Some((a, "blob.core.windows.net")) => { + self.account_name = Some(validate(a)?); } + _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), }, - Err(err) => { - self.url_parse_error = Some(Error::UnableToParseUrl { - source: err, - url: url.as_ref().into(), - }); - } - }; - self + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + } + Ok(()) } /// Set the Azure Account (required) @@ -595,63 +592,49 @@ impl MicrosoftAzureBuilder { /// Configure a connection to container with given name on Microsoft Azure /// Blob store. 
- pub fn build(self) -> Result { - let Self { - account_name, - access_key, - container_name, - bearer_token, - client_id, - client_secret, - tenant_id, - sas_query_pairs, - use_emulator, - retry_config, - authority_host, - mut client_options, - url_parse_error, - } = self; - - if let Some(err) = url_parse_error { - return Err(err.into()); + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; } - let container = container_name.ok_or(Error::MissingContainerName {})?; + let container = self.container_name.ok_or(Error::MissingContainerName {})?; - let (is_emulator, storage_url, auth, account) = if use_emulator { - let account_name = - account_name.unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); + let (is_emulator, storage_url, auth, account) = if self.use_emulator { + let account_name = self + .account_name + .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); // Allow overriding defaults. Values taken from // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let account_key = - access_key.unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); + let account_key = self + .access_key + .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); let credential = credential::CredentialProvider::AccessKey(account_key); - client_options = client_options.with_allow_http(true); + self.client_options = self.client_options.with_allow_http(true); (true, url, credential, account_name) } else { - let account_name = account_name.ok_or(Error::MissingAccount {})?; + let account_name = self.account_name.ok_or(Error::MissingAccount {})?; let account_url = format!("https://{}.blob.core.windows.net", &account_name); let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; - let credential = if let Some(bearer_token) = bearer_token { + let credential = if let Some(bearer_token) = self.bearer_token { Ok(credential::CredentialProvider::AccessKey(bearer_token)) - } else if let Some(access_key) = access_key { + } else if let Some(access_key) = self.access_key { Ok(credential::CredentialProvider::AccessKey(access_key)) } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = - (client_id, client_secret, tenant_id) + (self.client_id, self.client_secret, self.tenant_id) { let client_credential = credential::ClientSecretOAuthProvider::new( client_id, client_secret, tenant_id, - authority_host, + self.authority_host, ); Ok(credential::CredentialProvider::ClientSecret( client_credential, )) - } else if let Some(query_pairs) = sas_query_pairs { + } else if let Some(query_pairs) = self.sas_query_pairs { Ok(credential::CredentialProvider::SASToken(query_pairs)) } else { Err(Error::MissingCredentials {}) @@ -661,12 +644,12 @@ impl MicrosoftAzureBuilder { let config = client::AzureConfig { account, - retry_config, - service: storage_url, + is_emulator, container, + retry_config: self.retry_config, + client_options: self.client_options, + service: storage_url, credentials: auth, - is_emulator, - client_options, }; let client = Arc::new(client::AzureClient::new(config)?); @@ -804,26 +787,49 @@ mod tests { #[test] fn azure_blob_test_urls() { - let builder = MicrosoftAzureBuilder::new() - .with_url("abfss://file_system@account.dfs.core.windows.net/"); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account.dfs.core.windows.net/") + .unwrap(); 
assert_eq!(builder.account_name, Some("account".to_string())); assert_eq!(builder.container_name, Some("file_system".to_string())); - let builder = MicrosoftAzureBuilder::new().with_url("abfs://container/path"); + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("abfs://container/path").unwrap(); assert_eq!(builder.container_name, Some("container".to_string())); - let builder = MicrosoftAzureBuilder::new().with_url("az://container"); + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("az://container").unwrap(); assert_eq!(builder.container_name, Some("container".to_string())); - let builder = MicrosoftAzureBuilder::new().with_url("az://container/path"); + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("az://container/path").unwrap(); assert_eq!(builder.container_name, Some("container".to_string())); - let builder = MicrosoftAzureBuilder::new() - .with_url("https://account.dfs.core.windows.net/"); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.core.windows.net/") + .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); - let builder = MicrosoftAzureBuilder::new() - .with_url("https://account.blob.core.windows.net/"); - assert_eq!(builder.account_name, Some("account".to_string())) + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.core.windows.net/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + + let err_cases = [ + "mailto://account.blob.core.windows.net/", + "az://blob.mydomain/", + "abfs://container.foo/path", + "abfss://file_system@account.foo.dfs.core.windows.net/", + "abfss://file_system.bar@account.dfs.core.windows.net/", + "https://blob.mydomain/", + "https://blob.foo.dfs.core.windows.net/", + ]; + let mut builder = MicrosoftAzureBuilder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } } } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index c1424d9..f263874 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -42,7 +42,7 @@ use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; @@ -142,6 +142,9 @@ enum Error { scheme ))] UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, } impl From for super::Error { @@ -784,13 +787,13 @@ fn reader_credentials_file( /// .with_bucket_name(BUCKET_NAME) /// .build(); /// ``` -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct GoogleCloudStorageBuilder { bucket_name: Option, + url: Option, service_account_path: Option, retry_config: RetryConfig, client_options: ClientOptions, - url_parse_error: Option, } impl Default for GoogleCloudStorageBuilder { @@ -800,7 +803,7 @@ impl Default for GoogleCloudStorageBuilder { service_account_path: None, retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), - url_parse_error: None, + url: None, } } } @@ -845,9 +848,7 @@ impl GoogleCloudStorageBuilder { /// /// - `gs:///` /// - /// Please note that this is a best effort implementation, and will not fail for malformed URLs, - /// but rather warn and ignore the passed url. 
The url also has no effect on how the - /// storage is accessed - e.g. which driver or protocol is used for reading from the location. + /// Note: Settings derived from the URL will override any others set on this builder /// /// # Example /// ``` @@ -857,29 +858,31 @@ impl GoogleCloudStorageBuilder { /// .with_url("gs://bucket/path") /// .build(); /// ``` - pub fn with_url(mut self, url: impl AsRef) -> Self { - let maybe_parsed = Url::parse(url.as_ref()); - match maybe_parsed { - Ok(parsed) => match parsed.scheme() { - "gs" => { - self.bucket_name = parsed.host_str().map(|host| host.to_owned()); - } - other => { - self.url_parse_error = Some(Error::UnknownUrlScheme { - scheme: other.into(), - }); - } - }, - Err(err) => { - self.url_parse_error = Some(Error::UnableToParseUrl { - source: err, - url: url.as_ref().into(), - }); - } - }; + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); self } + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "gs" => self.bucket_name = Some(validate(host)?), + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + } + Ok(()) + } + /// Set the bucket name (required) pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { self.bucket_name = Some(bucket_name.into()); @@ -927,24 +930,17 @@ impl GoogleCloudStorageBuilder { /// Configure a connection to Google Cloud Storage, returning a /// new [`GoogleCloudStorage`] and consuming `self` - pub fn build(self) -> Result { - let Self { - bucket_name, - service_account_path, - retry_config, - client_options, - url_parse_error, - } = self; - - if let Some(err) = url_parse_error { - return Err(err.into()); + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; } - let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; - let service_account_path = - service_account_path.ok_or(Error::MissingServiceAccountPath)?; + let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; + let service_account_path = self + .service_account_path + .ok_or(Error::MissingServiceAccountPath)?; - let client = client_options.client()?; + let client = self.client_options.client()?; let credentials = reader_credentials_file(service_account_path)?; // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes @@ -977,8 +973,8 @@ impl GoogleCloudStorageBuilder { token_cache: Default::default(), bucket_name, bucket_name_encoded: encoded_bucket_name, - retry_config, - client_options, + retry_config: self.retry_config, + client_options: self.client_options, max_list_results: None, }), }) @@ -1199,7 +1195,14 @@ mod test { #[test] fn gcs_test_urls() { - let builder = GoogleCloudStorageBuilder::new().with_url("gs://bucket/path"); - assert_eq!(builder.bucket_name, Some("bucket".to_string())) + let mut builder = GoogleCloudStorageBuilder::new(); + builder.parse_url("gs://bucket/path").unwrap(); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let err_cases = 
["mailto://bucket/path", "gs://bucket.mydomain/path"]; + let mut builder = GoogleCloudStorageBuilder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } } } diff --git a/src/http/mod.rs b/src/http/mod.rs index 25997d8..f05e700 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -55,8 +55,11 @@ enum Error { #[snafu(display("Must specify a URL"))] MissingUrl, - #[snafu(display("Invalid URL: {}", source))] - InvalidUrl { source: reqwest::Error }, + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, #[snafu(display("Object is a directory"))] IsDirectory, @@ -210,9 +213,9 @@ impl ObjectStore for HttpStore { } /// Configure a connection to a generic HTTP server -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct HttpBuilder { - url: Option>, + url: Option, client_options: ClientOptions, retry_config: RetryConfig, } @@ -224,8 +227,8 @@ impl HttpBuilder { } /// Set the URL - pub fn with_url(mut self, url: impl reqwest::IntoUrl) -> Self { - self.url = Some(url.into_url().context(InvalidUrlSnafu).map_err(Into::into)); + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); self } @@ -243,9 +246,11 @@ impl HttpBuilder { /// Build an [`HttpStore`] with the configured options pub fn build(self) -> Result { - let url = self.url.context(MissingUrlSnafu)??; + let url = self.url.context(MissingUrlSnafu)?; + let parsed = Url::parse(&url).context(UnableToParseUrlSnafu { url })?; + Ok(HttpStore { - client: Client::new(url, self.client_options, self.retry_config)?, + client: Client::new(parsed, self.client_options, self.retry_config)?, }) } } From 42bf18c2da58652689af64a91ad6dab3c9505e9e Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Wed, 4 Jan 2023 21:03:45 +0100 Subject: [PATCH 079/397] object_store: builder configuration api (#3436) * feat: draf configuration api for azure * feat: add configuration for aws and gcp * fix: clippy * feat: allow passing typed config keys * refactor: implement try_from for config keys * chore: PR feedback * refactor: make options api fallible * fix: docs errors * chore: remove helpers * test: test sas key splitting and un-nit nits --- src/aws/mod.rs | 348 +++++++++++++++++++++++++++++++++++++++++++---- src/azure/mod.rs | 339 +++++++++++++++++++++++++++++++++++++++++---- src/gcp/mod.rs | 174 +++++++++++++++++++++++- src/lib.rs | 7 + src/util.rs | 9 ++ 5 files changed, 822 insertions(+), 55 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 786ccd2..4b633d9 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -37,9 +37,11 @@ use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; use itertools::Itertools; +use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; +use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; use tracing::info; @@ -51,6 +53,7 @@ use crate::aws::credential::{ StaticCredentialProvider, WebIdentityProvider, }; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; +use crate::util::str_is_truthy; use crate::{ ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, RetryConfig, StreamExt, @@ -133,13 +136,21 @@ enum Error { #[snafu(display("URL did not match any known pattern for scheme: {}", url))] UrlNotRecognised { url: String }, + + 
#[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, } impl From for super::Error { - fn from(err: Error) -> Self { - Self::Generic { - store: "S3", - source: Box::new(err), + fn from(source: Error) -> Self { + match source { + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: "S3", key } + } + _ => Self::Generic { + store: "S3", + source: Box::new(source), + }, } } } @@ -379,6 +390,184 @@ pub struct AmazonS3Builder { client_options: ClientOptions, } +/// Configuration keys for [`AmazonS3Builder`] +/// +/// Configuration via keys can be dome via the [`try_with_option`](AmazonS3Builder::try_with_option) +/// or [`with_options`](AmazonS3Builder::try_with_options) methods on the builder. +/// +/// # Example +/// ``` +/// use std::collections::HashMap; +/// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; +/// +/// let options = HashMap::from([ +/// ("aws_access_key_id", "my-access-key-id"), +/// ("aws_secret_access_key", "my-secret-access-key"), +/// ]); +/// let typed_options = vec![ +/// (AmazonS3ConfigKey::DefaultRegion, "my-default-region"), +/// ]; +/// let azure = AmazonS3Builder::new() +/// .try_with_options(options) +/// .unwrap() +/// .try_with_options(typed_options) +/// .unwrap() +/// .try_with_option(AmazonS3ConfigKey::Region, "my-region") +/// .unwrap(); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +pub enum AmazonS3ConfigKey { + /// AWS Access Key + /// + /// See [`AmazonS3Builder::with_access_key_id`] for details. + /// + /// Supported keys: + /// - `aws_access_key_id` + /// - `access_key_id` + AccessKeyId, + + /// Secret Access Key + /// + /// See [`AmazonS3Builder::with_secret_access_key`] for details. + /// + /// Supported keys: + /// - `aws_secret_access_key` + /// - `secret_access_key` + SecretAccessKey, + + /// Region + /// + /// See [`AmazonS3Builder::with_region`] for details. + /// + /// Supported keys: + /// - `aws_region` + /// - `region` + Region, + + /// Default region + /// + /// See [`AmazonS3Builder::with_region`] for details. + /// + /// Supported keys: + /// - `aws_default_region` + /// - `default_region` + DefaultRegion, + + /// Bucket name + /// + /// See [`AmazonS3Builder::with_bucket_name`] for details. + /// + /// Supported keys: + /// - `aws_bucket` + /// - `aws_bucket_name` + /// - `bucket` + /// - `bucket_name` + Bucket, + + /// Sets custom endpoint for communicating with AWS S3. + /// + /// See [`AmazonS3Builder::with_endpoint`] for details. + /// + /// Supported keys: + /// - `aws_endpoint` + /// - `aws_endpoint_url` + /// - `endpoint` + /// - `endpoint_url` + Endpoint, + + /// Token to use for requests (passed to underlying provider) + /// + /// See [`AmazonS3Builder::with_token`] for details. + /// + /// Supported keys: + /// - `aws_session_token` + /// - `aws_token` + /// - `session_token` + /// - `token` + Token, + + /// Fall back to ImdsV1 + /// + /// See [`AmazonS3Builder::with_imdsv1_fallback`] for details. + /// + /// Supported keys: + /// - `aws_imdsv1_fallback` + /// - `imdsv1_fallback` + ImdsV1Fallback, + + /// If virtual hosted style request has to be used + /// + /// See [`AmazonS3Builder::with_virtual_hosted_style_request`] for details. + /// + /// Supported keys: + /// - `aws_virtual_hosted_style_request` + /// - `virtual_hosted_style_request` + VirtualHostedStyleRequest, + + /// Set the instance metadata endpoint + /// + /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. 
+ /// + /// Supported keys: + /// - `aws_metadata_endpoint` + /// - `metadata_endpoint` + MetadataEndpoint, + + /// AWS profile name + /// + /// Supported keys: + /// - `aws_profile` + /// - `profile` + Profile, +} + +impl AsRef for AmazonS3ConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AccessKeyId => "aws_access_key_id", + Self::SecretAccessKey => "aws_secret_access_key", + Self::Region => "aws_region", + Self::Bucket => "aws_bucket", + Self::Endpoint => "aws_endpoint", + Self::Token => "aws_session_token", + Self::ImdsV1Fallback => "aws_imdsv1_fallback", + Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", + Self::DefaultRegion => "aws_default_region", + Self::MetadataEndpoint => "aws_metadata_endpoint", + Self::Profile => "aws_profile", + } + } +} + +impl FromStr for AmazonS3ConfigKey { + type Err = super::Error; + + fn from_str(s: &str) -> Result { + match s { + "aws_access_key_id" | "access_key_id" => Ok(Self::AccessKeyId), + "aws_secret_access_key" | "secret_access_key" => Ok(Self::SecretAccessKey), + "aws_default_region" | "default_region" => Ok(Self::DefaultRegion), + "aws_region" | "region" => Ok(Self::Region), + "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => { + Ok(Self::Bucket) + } + "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => { + Ok(Self::Endpoint) + } + "aws_session_token" | "aws_token" | "session_token" | "token" => { + Ok(Self::Token) + } + "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { + Ok(Self::VirtualHostedStyleRequest) + } + "aws_profile" | "profile" => Ok(Self::Profile), + "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), + "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), + _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + } + } +} + impl AmazonS3Builder { /// Create a new [`AmazonS3Builder`] with default values. pub fn new() -> Self { @@ -407,28 +596,16 @@ impl AmazonS3Builder { pub fn from_env() -> Self { let mut builder: Self = Default::default(); - if let Ok(access_key_id) = std::env::var("AWS_ACCESS_KEY_ID") { - builder.access_key_id = Some(access_key_id); - } - - if let Ok(secret_access_key) = std::env::var("AWS_SECRET_ACCESS_KEY") { - builder.secret_access_key = Some(secret_access_key); - } - - if let Ok(secret) = std::env::var("AWS_DEFAULT_REGION") { - builder.region = Some(secret); - } - - if let Ok(endpoint) = std::env::var("AWS_ENDPOINT") { - builder.endpoint = Some(endpoint); - } - - if let Ok(token) = std::env::var("AWS_SESSION_TOKEN") { - builder.token = Some(token); - } - - if let Ok(profile) = std::env::var("AWS_PROFILE") { - builder.profile = Some(profile); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("AWS_") { + if let Ok(config_key) = + AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) + { + builder = builder.try_with_option(config_key, value).unwrap(); + } + } + } } // This env var is set in ECS @@ -442,7 +619,7 @@ impl AmazonS3Builder { if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { builder.client_options = - builder.client_options.with_allow_http(text == "true"); + builder.client_options.with_allow_http(str_is_truthy(&text)); } builder @@ -472,6 +649,55 @@ impl AmazonS3Builder { self } + /// Set an option on the builder via a key - value pair. + /// + /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. 
+ pub fn try_with_option( + mut self, + key: impl AsRef, + value: impl Into, + ) -> Result { + match AmazonS3ConfigKey::from_str(key.as_ref())? { + AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), + AmazonS3ConfigKey::SecretAccessKey => { + self.secret_access_key = Some(value.into()) + } + AmazonS3ConfigKey::Region => self.region = Some(value.into()), + AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), + AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), + AmazonS3ConfigKey::Token => self.token = Some(value.into()), + AmazonS3ConfigKey::ImdsV1Fallback => { + self.imdsv1_fallback = str_is_truthy(&value.into()) + } + AmazonS3ConfigKey::VirtualHostedStyleRequest => { + self.virtual_hosted_style_request = str_is_truthy(&value.into()) + } + AmazonS3ConfigKey::DefaultRegion => { + self.region = self.region.or_else(|| Some(value.into())) + } + AmazonS3ConfigKey::MetadataEndpoint => { + self.metadata_endpoint = Some(value.into()) + } + AmazonS3ConfigKey::Profile => self.profile = Some(value.into()), + }; + Ok(self) + } + + /// Hydrate builder from key value pairs + /// + /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -773,6 +999,7 @@ mod tests { put_get_delete_list_opts, rename_and_copy, stream_get, }; use bytes::Bytes; + use std::collections::HashMap; use std::env; const NON_EXISTENT_NAME: &str = "nonexistentname"; @@ -915,6 +1142,73 @@ mod tests { assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); } + #[test] + fn s3_test_config_from_map() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + let options = HashMap::from([ + ("aws_access_key_id", aws_access_key_id.clone()), + ("aws_secret_access_key", aws_secret_access_key), + ("aws_default_region", aws_default_region.clone()), + ("aws_endpoint", aws_endpoint.clone()), + ("aws_session_token", aws_session_token.clone()), + ]); + + let builder = AmazonS3Builder::new() + .try_with_options(&options) + .unwrap() + .try_with_option("aws_secret_access_key", "new-secret-key") + .unwrap(); + assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); + assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); + assert_eq!(builder.region.unwrap(), aws_default_region); + assert_eq!(builder.endpoint.unwrap(), aws_endpoint); + assert_eq!(builder.token.unwrap(), aws_session_token); + } + + #[test] + fn s3_test_config_from_typed_map() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + let options = HashMap::from([ + (AmazonS3ConfigKey::AccessKeyId, 
aws_access_key_id.clone()), + (AmazonS3ConfigKey::SecretAccessKey, aws_secret_access_key), + (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), + (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), + (AmazonS3ConfigKey::Token, aws_session_token.clone()), + ]); + + let builder = AmazonS3Builder::new() + .try_with_options(&options) + .unwrap() + .try_with_option(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key") + .unwrap(); + assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); + assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); + assert_eq!(builder.region.unwrap(), aws_default_region); + assert_eq!(builder.endpoint.unwrap(), aws_endpoint); + assert_eq!(builder.token.unwrap(), aws_session_token); + } + + #[test] + fn s3_test_config_fallible_options() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let options = HashMap::from([ + ("aws_access_key_id", aws_access_key_id), + ("invalid-key", aws_secret_access_key), + ]); + + let builder = AmazonS3Builder::new().try_with_options(&options); + assert!(builder.is_err()); + } + #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 7cf369d..416883a 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -37,16 +37,18 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{TimeZone, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use percent_encoding::percent_decode_str; +use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; -use std::collections::BTreeSet; use std::fmt::{Debug, Formatter}; use std::io; use std::ops::Range; use std::sync::Arc; +use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; -use crate::util::RFC1123_FMT; +use crate::util::{str_is_truthy, RFC1123_FMT}; pub use credential::authority_hosts; mod client; @@ -124,13 +126,28 @@ enum Error { #[snafu(display("URL did not match any known pattern for scheme: {}", url))] UrlNotRecognised { url: String }, + + #[snafu(display("Failed parsing an SAS key"))] + DecodeSasKey { source: std::str::Utf8Error }, + + #[snafu(display("Missing component in SAS query pair"))] + MissingSasComponent {}, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, } impl From for super::Error { fn from(source: Error) -> Self { - Self::Generic { - store: "MicrosoftAzure", - source: Box::new(source), + match source { + Error::UnknownConfigurationKey { key } => Self::UnknownConfigurationKey { + store: "MicrosoftAzure", + key, + }, + _ => Self::Generic { + store: "MicrosoftAzure", + source: Box::new(source), + }, } } } @@ -367,6 +384,7 @@ pub struct MicrosoftAzureBuilder { client_secret: Option, tenant_id: Option, sas_query_pairs: Option>, + sas_key: Option, authority_host: Option, url: Option, use_emulator: bool, @@ -374,6 +392,157 @@ pub struct MicrosoftAzureBuilder { client_options: ClientOptions, } +/// Configuration keys for [`MicrosoftAzureBuilder`] +/// +/// Configuration via keys can be dome via the [`try_with_option`](MicrosoftAzureBuilder::try_with_option) +/// or [`with_options`](MicrosoftAzureBuilder::try_with_options) methods on the builder. 
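Because both methods parse the key eagerly, a misspelled option is reported before any request is made. A short sketch of catching that failure, assuming the `UnknownConfigurationKey` variant added to `object_store::Error` in this change; the key below is intentionally invalid:

```
use object_store::azure::MicrosoftAzureBuilder;
use object_store::Error;

fn detect_bad_key() {
    let result = MicrosoftAzureBuilder::new()
        .try_with_options([("not_a_real_key", "some value")]);
    match result {
        Err(Error::UnknownConfigurationKey { store, key }) => {
            eprintln!("{} rejected configuration key '{}'", store, key)
        }
        other => panic!("expected UnknownConfigurationKey, got {:?}", other),
    }
}
```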
+/// +/// # Example +/// ``` +/// use std::collections::HashMap; +/// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; +/// +/// let options = HashMap::from([ +/// ("azure_client_id", "my-client-id"), +/// ("azure_client_secret", "my-account-name"), +/// ]); +/// let typed_options = vec![ +/// (AzureConfigKey::AccountName, "my-account-name"), +/// ]; +/// let azure = MicrosoftAzureBuilder::new() +/// .try_with_options(options) +/// .unwrap() +/// .try_with_options(typed_options) +/// .unwrap() +/// .try_with_option(AzureConfigKey::AuthorityId, "my-tenant-id") +/// .unwrap(); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] +pub enum AzureConfigKey { + /// The name of the azure storage account + /// + /// Supported keys: + /// - `azure_storage_account_name` + /// - `account_name` + AccountName, + + /// Master key for accessing storage account + /// + /// Supported keys: + /// - `azure_storage_account_key` + /// - `azure_storage_access_key` + /// - `azure_storage_master_key` + /// - `access_key` + /// - `account_key` + /// - `master_key` + AccessKey, + + /// Service principal client id for authorizing requests + /// + /// Supported keys: + /// - `azure_storage_client_id` + /// - `azure_client_id` + /// - `client_id` + ClientId, + + /// Service principal client secret for authorizing requests + /// + /// Supported keys: + /// - `azure_storage_client_secret` + /// - `azure_client_secret` + /// - `client_secret` + ClientSecret, + + /// Tenant id used in oauth flows + /// + /// Supported keys: + /// - `azure_storage_tenant_id` + /// - `azure_storage_authority_id` + /// - `azure_tenant_id` + /// - `azure_authority_id` + /// - `tenant_id` + /// - `authority_id` + AuthorityId, + + /// Shared access signature. + /// + /// The signature is expected to be percent-encoded, much like they are provided + /// in the azure storage explorer or azure portal. 
+ /// + /// Supported keys: + /// - `azure_storage_sas_key` + /// - `azure_storage_sas_token` + /// - `sas_key` + /// - `sas_token` + SasKey, + + /// Bearer token + /// + /// Supported keys: + /// - `azure_storage_token` + /// - `bearer_token` + /// - `token` + Token, + + /// Use object store with azurite storage emulator + /// + /// Supported keys: + /// - `azure_storage_use_emulator` + /// - `object_store_use_emulator` + /// - `use_emulator` + UseEmulator, +} + +impl AsRef for AzureConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AccountName => "azure_storage_account_name", + Self::AccessKey => "azure_storage_account_key", + Self::ClientId => "azure_storage_client_id", + Self::ClientSecret => "azure_storage_client_secret", + Self::AuthorityId => "azure_storage_tenant_id", + Self::SasKey => "azure_storage_sas_key", + Self::Token => "azure_storage_token", + Self::UseEmulator => "azure_storage_use_emulator", + } + } +} + +impl FromStr for AzureConfigKey { + type Err = super::Error; + + fn from_str(s: &str) -> Result { + match s { + "azure_storage_account_key" + | "azure_storage_access_key" + | "azure_storage_master_key" + | "master_key" + | "account_key" + | "access_key" => Ok(Self::AccessKey), + "azure_storage_account_name" | "account_name" => Ok(Self::AccountName), + "azure_storage_client_id" | "azure_client_id" | "client_id" => { + Ok(Self::ClientId) + } + "azure_storage_client_secret" | "azure_client_secret" | "client_secret" => { + Ok(Self::ClientSecret) + } + "azure_storage_tenant_id" + | "azure_storage_authority_id" + | "azure_tenant_id" + | "azure_authority_id" + | "tenant_id" + | "authority_id" => Ok(Self::AuthorityId), + "azure_storage_sas_key" + | "azure_storage_sas_token" + | "sas_key" + | "sas_token" => Ok(Self::SasKey), + "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), + "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), + _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + } + } +} + impl Debug for MicrosoftAzureBuilder { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( @@ -409,27 +578,21 @@ impl MicrosoftAzureBuilder { /// ``` pub fn from_env() -> Self { let mut builder = Self::default(); - - if let Ok(account_name) = std::env::var("AZURE_STORAGE_ACCOUNT_NAME") { - builder.account_name = Some(account_name); - } - - if let Ok(access_key) = std::env::var("AZURE_STORAGE_ACCOUNT_KEY") { - builder.access_key = Some(access_key); - } else if let Ok(access_key) = std::env::var("AZURE_STORAGE_ACCESS_KEY") { - builder.access_key = Some(access_key); - } - - if let Ok(client_id) = std::env::var("AZURE_STORAGE_CLIENT_ID") { - builder.client_id = Some(client_id); - } - - if let Ok(client_secret) = std::env::var("AZURE_STORAGE_CLIENT_SECRET") { - builder.client_secret = Some(client_secret); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("AZURE_") { + if let Ok(config_key) = + AzureConfigKey::from_str(&key.to_ascii_lowercase()) + { + builder = builder.try_with_option(config_key, value).unwrap(); + } + } + } } - if let Ok(tenant_id) = std::env::var("AZURE_STORAGE_TENANT_ID") { - builder.tenant_id = Some(tenant_id); + if let Ok(text) = std::env::var("AZURE_ALLOW_HTTP") { + builder.client_options = + builder.client_options.with_allow_http(str_is_truthy(&text)); } builder @@ -462,6 +625,40 @@ impl MicrosoftAzureBuilder { self } + /// Set an option on the builder via a key - value pair. 
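With the `SasKey` entry in place, a shared access signature can be supplied as one percent-encoded string rather than pre-split query pairs; the builder decodes and splits it (see `split_sas` below) when the store is built. A sketch with a placeholder account name and a truncated placeholder token:

```
use object_store::azure::{AzureConfigKey, MicrosoftAzureBuilder};

fn with_sas_token() -> object_store::Result<MicrosoftAzureBuilder> {
    // In practice the value is the full query string copied from the Azure
    // portal or storage explorer, with or without the leading '?'.
    MicrosoftAzureBuilder::new()
        .try_with_option(AzureConfigKey::AccountName, "mystorageaccount")?
        .try_with_option(AzureConfigKey::SasKey, "sv=2021-10-04&sp=rcwl&sig=...")
}
```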
+ pub fn try_with_option( + mut self, + key: impl AsRef, + value: impl Into, + ) -> Result { + match AzureConfigKey::from_str(key.as_ref())? { + AzureConfigKey::AccessKey => self.access_key = Some(value.into()), + AzureConfigKey::AccountName => self.account_name = Some(value.into()), + AzureConfigKey::ClientId => self.client_id = Some(value.into()), + AzureConfigKey::ClientSecret => self.client_secret = Some(value.into()), + AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), + AzureConfigKey::SasKey => self.sas_key = Some(value.into()), + AzureConfigKey::Token => self.bearer_token = Some(value.into()), + AzureConfigKey::UseEmulator => { + self.use_emulator = str_is_truthy(&value.into()) + } + }; + Ok(self) + } + + /// Hydrate builder from key value pairs + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -636,6 +833,8 @@ impl MicrosoftAzureBuilder { )) } else if let Some(query_pairs) = self.sas_query_pairs { Ok(credential::CredentialProvider::SASToken(query_pairs)) + } else if let Some(sas) = self.sas_key { + Ok(credential::CredentialProvider::SASToken(split_sas(&sas)?)) } else { Err(Error::MissingCredentials {}) }?; @@ -673,6 +872,25 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { Ok(url) } +fn split_sas(sas: &str) -> Result, Error> { + let sas = percent_decode_str(sas) + .decode_utf8() + .context(DecodeSasKeySnafu {})?; + let kv_str_pairs = sas + .trim_start_matches('?') + .split('&') + .filter(|s| !s.chars().all(char::is_whitespace)); + let mut pairs = Vec::new(); + for kv_pair_str in kv_str_pairs { + let (k, v) = kv_pair_str + .trim() + .split_once('=') + .ok_or(Error::MissingSasComponent {})?; + pairs.push((k.into(), v.into())) + } + Ok(pairs) +} + #[cfg(test)] mod tests { use super::*; @@ -680,6 +898,7 @@ mod tests { copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, put_get_delete_list, put_get_delete_list_opts, rename_and_copy, stream_get, }; + use std::collections::HashMap; use std::env; // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment @@ -832,4 +1051,76 @@ mod tests { builder.parse_url(case).unwrap_err(); } } + + #[test] + fn azure_test_config_from_map() { + let azure_client_id = "object_store:fake_access_key_id"; + let azure_storage_account_name = "object_store:fake_secret_key"; + let azure_storage_token = "object_store:fake_default_region"; + let options = HashMap::from([ + ("azure_client_id", azure_client_id), + ("azure_storage_account_name", azure_storage_account_name), + ("azure_storage_token", azure_storage_token), + ]); + + let builder = MicrosoftAzureBuilder::new() + .try_with_options(options) + .unwrap(); + assert_eq!(builder.client_id.unwrap(), azure_client_id); + assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); + assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); + } + + #[test] + fn azure_test_config_from_typed_map() { + let azure_client_id = "object_store:fake_access_key_id".to_string(); + let azure_storage_account_name = "object_store:fake_secret_key".to_string(); + let azure_storage_token = "object_store:fake_default_region".to_string(); + let options = HashMap::from([ + (AzureConfigKey::ClientId, azure_client_id.clone()), + ( + AzureConfigKey::AccountName, + 
azure_storage_account_name.clone(), + ), + (AzureConfigKey::Token, azure_storage_token.clone()), + ]); + + let builder = MicrosoftAzureBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!(builder.client_id.unwrap(), azure_client_id); + assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); + assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); + } + + #[test] + fn azure_test_config_fallible_options() { + let azure_client_id = "object_store:fake_access_key_id".to_string(); + let azure_storage_token = "object_store:fake_default_region".to_string(); + let options = HashMap::from([ + ("azure_client_id", azure_client_id), + ("invalid-key", azure_storage_token), + ]); + + let builder = MicrosoftAzureBuilder::new().try_with_options(&options); + assert!(builder.is_err()); + } + + #[test] + fn azure_test_split_sas() { + let raw_sas = "?sv=2021-10-04&st=2023-01-04T17%3A48%3A57Z&se=2023-01-04T18%3A15%3A00Z&sr=c&sp=rcwl&sig=C7%2BZeEOWbrxPA3R0Cw%2Fw1EZz0%2B4KBvQexeKZKe%2BB6h0%3D"; + let expected = vec![ + ("sv".to_string(), "2021-10-04".to_string()), + ("st".to_string(), "2023-01-04T17:48:57Z".to_string()), + ("se".to_string(), "2023-01-04T18:15:00Z".to_string()), + ("sr".to_string(), "c".to_string()), + ("sp".to_string(), "rcwl".to_string()), + ( + "sig".to_string(), + "C7+ZeEOWbrxPA3R0Cw/w1EZz0+4KBvQexeKZKe+B6h0=".to_string(), + ), + ]; + let pairs = split_sas(raw_sas).unwrap(); + assert_eq!(expected, pairs); + } } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index f263874..177812f 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -33,6 +33,7 @@ use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufReader}; use std::ops::Range; +use std::str::FromStr; use std::sync::Arc; use async_trait::async_trait; @@ -42,6 +43,7 @@ use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; +use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; @@ -145,6 +147,9 @@ enum Error { #[snafu(display("URL did not match any known pattern for scheme: {}", url))] UrlNotRecognised { url: String }, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, } impl From for super::Error { @@ -164,6 +169,9 @@ impl From for super::Error { source: Box::new(source), path, }, + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: "GCS", key } + } _ => Self::Generic { store: "GCS", source: Box::new(err), @@ -796,6 +804,74 @@ pub struct GoogleCloudStorageBuilder { client_options: ClientOptions, } +/// Configuration keys for [`GoogleCloudStorageBuilder`] +/// +/// Configuration via keys can be dome via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option) +/// or [`with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder. 
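Besides the programmatic example below, the same key names are also picked up from the environment: `from_env` lowercases any `GOOGLE_`-prefixed variable and feeds it through `GoogleConfigKey::from_str`, skipping anything it does not recognize. A sketch assuming the relevant variables are set in the surrounding environment and that the bucket name is a placeholder:

```
use object_store::gcp::GoogleCloudStorageBuilder;

fn gcs_from_env() -> GoogleCloudStorageBuilder {
    // Honors e.g. GOOGLE_SERVICE_ACCOUNT if present; an explicit setter can
    // still override or supplement what the environment provides.
    GoogleCloudStorageBuilder::from_env().with_bucket_name("example-bucket")
}
```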
+/// +/// # Example +/// ``` +/// use std::collections::HashMap; +/// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; +/// +/// let options = HashMap::from([ +/// ("google_service_account", "my-service-account"), +/// ]); +/// let typed_options = vec![ +/// (GoogleConfigKey::Bucket, "my-bucket"), +/// ]; +/// let azure = GoogleCloudStorageBuilder::new() +/// .try_with_options(options) +/// .unwrap() +/// .try_with_options(typed_options) +/// .unwrap() +/// .try_with_option(GoogleConfigKey::Bucket, "my-new-bucket") +/// .unwrap(); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +pub enum GoogleConfigKey { + /// Path to the service account file + /// + /// Supported keys: + /// - `google_service_account` + /// - `service_account` + ServiceAccount, + + /// Bucket name + /// + /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details. + /// + /// Supported keys: + /// - `google_bucket` + /// - `google_bucket_name` + /// - `bucket` + /// - `bucket_name` + Bucket, +} + +impl AsRef for GoogleConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::ServiceAccount => "google_service_account", + Self::Bucket => "google_bucket", + } + } +} + +impl FromStr for GoogleConfigKey { + type Err = super::Error; + + fn from_str(s: &str) -> Result { + match s { + "google_service_account" | "service_account" => Ok(Self::ServiceAccount), + "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { + Ok(Self::Bucket) + } + _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + } + } +} + impl Default for GoogleCloudStorageBuilder { fn default() -> Self { Self { @@ -835,8 +911,16 @@ impl GoogleCloudStorageBuilder { builder.service_account_path = Some(service_account_path); } - if let Ok(service_account_path) = std::env::var("GOOGLE_SERVICE_ACCOUNT") { - builder.service_account_path = Some(service_account_path); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("GOOGLE_") { + if let Ok(config_key) = + GoogleConfigKey::from_str(&key.to_ascii_lowercase()) + { + builder = builder.try_with_option(config_key, value).unwrap(); + } + } + } } builder @@ -863,6 +947,34 @@ impl GoogleCloudStorageBuilder { self } + /// Set an option on the builder via a key - value pair. + pub fn try_with_option( + mut self, + key: impl AsRef, + value: impl Into, + ) -> Result { + match GoogleConfigKey::from_str(key.as_ref())? 
{ + GoogleConfigKey::ServiceAccount => { + self.service_account_path = Some(value.into()) + } + GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), + }; + Ok(self) + } + + /// Hydrate builder from key value pairs + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -995,9 +1107,9 @@ fn convert_object_meta(object: &Object) -> Result { #[cfg(test)] mod test { - use std::env; - use bytes::Bytes; + use std::collections::HashMap; + use std::env; use crate::{ tests::{ @@ -1205,4 +1317,58 @@ mod test { builder.parse_url(case).unwrap_err(); } } + + #[test] + fn gcs_test_config_from_map() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ("google_service_account", google_service_account.clone()), + ("google_bucket_name", google_bucket_name.clone()), + ]); + + let builder = GoogleCloudStorageBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!( + builder.service_account_path.unwrap(), + google_service_account.as_str() + ); + assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); + } + + #[test] + fn gcs_test_config_from_typed_map() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ( + GoogleConfigKey::ServiceAccount, + google_service_account.clone(), + ), + (GoogleConfigKey::Bucket, google_bucket_name.clone()), + ]); + + let builder = GoogleCloudStorageBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!( + builder.service_account_path.unwrap(), + google_service_account.as_str() + ); + assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); + } + + #[test] + fn gcs_test_config_fallible_options() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ("google_service_account", google_service_account), + ("invalid-key", google_bucket_name), + ]); + + let builder = GoogleCloudStorageBuilder::new().try_with_options(&options); + assert!(builder.is_err()); + } } diff --git a/src/lib.rs b/src/lib.rs index 425c5cd..4ec58c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -555,6 +555,13 @@ pub enum Error { #[snafu(display("Operation not yet implemented."))] NotImplemented, + + #[snafu(display( + "Configuration key: '{}' is not valid for store '{}'.", + key, + store + ))] + UnknownConfigurationKey { store: &'static str, key: String }, } impl From for std::io::Error { diff --git a/src/util.rs b/src/util.rs index e592e7b..08bfd86 100644 --- a/src/util.rs +++ b/src/util.rs @@ -185,6 +185,15 @@ fn merge_ranges( ret } +#[allow(dead_code)] +pub(crate) fn str_is_truthy(val: &str) -> bool { + val.eq_ignore_ascii_case("1") + | val.eq_ignore_ascii_case("true") + | val.eq_ignore_ascii_case("on") + | val.eq_ignore_ascii_case("yes") + | val.eq_ignore_ascii_case("y") +} + #[cfg(test)] mod tests { use super::*; From b941d49c11c796e0e3d8951f3808a38aa00fecf4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 5 Jan 2023 09:08:21 +0000 
Subject: [PATCH 080/397] Prepare object_store 0.5.3 (#3457) --- CHANGELOG-old.md | 27 ++++++++++++++++++++ CHANGELOG.md | 42 +++++++++++++++++++------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +-- 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 1397d8a..2813cfc 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -21,6 +21,33 @@ # Changelog +## [object_store_0.5.2](https://github.com/apache/arrow-rs/tree/object_store_0.5.2) (2022-12-02) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.1...object_store_0.5.2) + +**Implemented enhancements:** + +- Object Store: Allow custom reqwest client [\#3127](https://github.com/apache/arrow-rs/issues/3127) +- socks5 proxy support for the object\_store crate [\#2989](https://github.com/apache/arrow-rs/issues/2989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Cannot query S3 paths containing whitespace [\#2799](https://github.com/apache/arrow-rs/issues/2799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- object\_store\(gcp\): GCP complains about content-length for copy [\#3235](https://github.com/apache/arrow-rs/issues/3235) +- object\_store\(aws\): EntityTooSmall error on multi-part upload [\#3233](https://github.com/apache/arrow-rs/issues/3233) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Add more ClientConfig Options for Object Store RequestBuilder \(\#3127\) [\#3256](https://github.com/apache/arrow-rs/pull/3256) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore ClientConfig [\#3252](https://github.com/apache/arrow-rs/pull/3252) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix\(object\_store,gcp\): test copy\_if\_not\_exist [\#3236](https://github.com/apache/arrow-rs/pull/3236) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- fix\(object\_store,aws,gcp\): multipart upload enforce size limit of 5 MiB not 5MB [\#3234](https://github.com/apache/arrow-rs/pull/3234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- object\_store: add support for using proxy\_url for connection testing [\#3109](https://github.com/apache/arrow-rs/pull/3109) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sum12](https://github.com/sum12)) +- Update AWS SDK [\#2974](https://github.com/apache/arrow-rs/pull/2974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.25.0 to 0.26.0 [\#2918](https://github.com/apache/arrow-rs/pull/2918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support building object_store and parquet on wasm32-unknown-unknown target [\#2896](https://github.com/apache/arrow-rs/pull/2899) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jondo2010](https://github.com/jondo2010)) +- Add experimental AWS\_PROFILE support \(\#2178\) [\#2891](https://github.com/apache/arrow-rs/pull/2891) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([tustvold](https://github.com/tustvold)) + ## [object_store_0.5.1](https://github.com/apache/arrow-rs/tree/object_store_0.5.1) (2022-10-04) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) diff --git a/CHANGELOG.md b/CHANGELOG.md index 528d649..41b029c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,32 +19,40 @@ # Changelog -## [object_store_0.5.2](https://github.com/apache/arrow-rs/tree/object_store_0.5.2) (2022-12-02) +## [object_store_0.5.3](https://github.com/apache/arrow-rs/tree/object_store_0.5.3) (2023-01-04) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.1...object_store_0.5.2) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.2...object_store_0.5.3) **Implemented enhancements:** -- Object Store: Allow custom reqwest client [\#3127](https://github.com/apache/arrow-rs/issues/3127) -- socks5 proxy support for the object\_store crate [\#2989](https://github.com/apache/arrow-rs/issues/2989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Cannot query S3 paths containing whitespace [\#2799](https://github.com/apache/arrow-rs/issues/2799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Derive Clone for the builders in object-store. [\#3419](https://github.com/apache/arrow-rs/issues/3419) +- Add a constant prefix object store wrapper [\#3328](https://github.com/apache/arrow-rs/issues/3328) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add support for content-type while uploading files through ObjectStore API [\#3300](https://github.com/apache/arrow-rs/issues/3300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add HttpStore [\#3294](https://github.com/apache/arrow-rs/issues/3294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) +- object\_store: Add Put and Multipart Upload Doc Examples [\#2863](https://github.com/apache/arrow-rs/issues/2863) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -**Fixed bugs:** +**Closed issues:** -- object\_store\(gcp\): GCP complains about content-length for copy [\#3235](https://github.com/apache/arrow-rs/issues/3235) -- object\_store\(aws\): EntityTooSmall error on multi-part upload [\#3233](https://github.com/apache/arrow-rs/issues/3233) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Only flush buffered multi-part data on poll\_shutdown not on poll\_flush [\#3390](https://github.com/apache/arrow-rs/issues/3390) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Add more ClientConfig Options for Object Store RequestBuilder \(\#3127\) [\#3256](https://github.com/apache/arrow-rs/pull/3256) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add ObjectStore ClientConfig [\#3252](https://github.com/apache/arrow-rs/pull/3252) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- fix\(object\_store,gcp\): test copy\_if\_not\_exist [\#3236](https://github.com/apache/arrow-rs/pull/3236) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- fix\(object\_store,aws,gcp\): 
multipart upload enforce size limit of 5 MiB not 5MB [\#3234](https://github.com/apache/arrow-rs/pull/3234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- object\_store: add support for using proxy\_url for connection testing [\#3109](https://github.com/apache/arrow-rs/pull/3109) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sum12](https://github.com/sum12)) -- Update AWS SDK [\#2974](https://github.com/apache/arrow-rs/pull/2974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update quick-xml requirement from 0.25.0 to 0.26.0 [\#2918](https://github.com/apache/arrow-rs/pull/2918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support building object_store and parquet on wasm32-unknown-unknown target [\#2896](https://github.com/apache/arrow-rs/pull/2899) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jondo2010](https://github.com/jondo2010)) -- Add experimental AWS\_PROFILE support \(\#2178\) [\#2891](https://github.com/apache/arrow-rs/pull/2891) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: builder configuration api [\#3436](https://github.com/apache/arrow-rs/pull/3436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Derive Clone for ObjectStore builders and Make URL Parsing Stricter \(\#3419\) [\#3424](https://github.com/apache/arrow-rs/pull/3424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add Put and Multipart Put doc examples [\#3420](https://github.com/apache/arrow-rs/pull/3420) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([GeauxEric](https://github.com/GeauxEric)) +- object\_store: update localstack instructions [\#3403](https://github.com/apache/arrow-rs/pull/3403) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- object\_store: Flush buffered multipart only during poll\_shutdown [\#3397](https://github.com/apache/arrow-rs/pull/3397) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) +- Update quick-xml to 0.27 [\#3395](https://github.com/apache/arrow-rs/pull/3395) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add HttpStore \(\#3294\) [\#3380](https://github.com/apache/arrow-rs/pull/3380) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- add support for content-type in `ClientOptions` [\#3358](https://github.com/apache/arrow-rs/pull/3358) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- Update AWS SDK [\#3349](https://github.com/apache/arrow-rs/pull/3349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Upstream newline\_delimited\_stream and ChunkedStore from DataFusion [\#3341](https://github.com/apache/arrow-rs/pull/3341) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- 
feat\(object\_store\): add PrefixObjectStore [\#3329](https://github.com/apache/arrow-rs/pull/3329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- feat\(object\_store\): parse well-known storage urls [\#3327](https://github.com/apache/arrow-rs/pull/3327) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Disable getrandom object\_store [\#3278](https://github.com/apache/arrow-rs/pull/3278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Reload token from AWS\_WEB\_IDENTITY\_TOKEN\_FILE [\#3274](https://github.com/apache/arrow-rs/pull/3274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Minor: skip aws integration test if TEST\_INTEGRATION is not set [\#3262](https://github.com/apache/arrow-rs/pull/3262) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) diff --git a/Cargo.toml b/Cargo.toml index a9cc151..e61a127 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.2" +version = "0.5.3" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index cf070d3..2f6c809 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.1" -FUTURE_RELEASE="object_store_0.5.2" +SINCE_TAG="object_store_0.5.2" +FUTURE_RELEASE="object_store_0.5.3" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 3e3102381b5f2fbfb8b908a997a68632d53dc429 Mon Sep 17 00:00:00 2001 From: "Valeriy V. Vorotyntsev" Date: Sat, 7 Jan 2023 18:16:49 +0200 Subject: [PATCH 081/397] [doc] Fix broken URLs (#3486) * [doc] Fix broken URLs Use proper syntax when [linking to items by name]. Before: https://docs.rs/arrow-array/latest/arrow_array/iterator/%5Bcrate::PrimitiveArray%5D After: https://docs.rs/arrow-array/latest/arrow_array/array/struct.PrimitiveArray.html [linking to items by name]: https://doc.rust-lang.org/rustdoc/write-documentation/linking-to-items-by-name.html * [doc] Use proper identifiers arrow-array: - `DecimalArray` is undefined. Use `PrimitiveArray` instead. - `arrow` crate is not among `arrow-array`'s dependencies, so its items cannot be referred to using ['intra-doc link'] syntax. 
['intra-doc link']: https://doc.rust-lang.org/rustdoc/write-documentation/linking-to-items-by-name.html --- src/aws/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 4b633d9..2017469 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -814,8 +814,8 @@ impl AmazonS3Builder { /// /// This option has no effect if not using instance credentials /// - /// [IMDSv2]: [https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html] - /// [SSRF attack]: [https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/] + /// [IMDSv2]: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html + /// [SSRF attack]: https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/ /// pub fn with_imdsv1_fallback(mut self) -> Self { self.imdsv1_fallback = true; From 53c9e461720dba2e02b955bea5824fa405a4f7d0 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Mon, 9 Jan 2023 04:25:29 -0600 Subject: [PATCH 082/397] feat: Allow providing a service account key directly for GCS (#3489) * feat: Allow providing a service account key directly for GCP Use case: We're storing service accounts keys external to where the object store client is being created. We do not want to have to write the key to a file before creating the object store client. This change allows for providing the key directly. * Add additional aliases for specifying service account path "google_service_account_path" and "service_account_path" can now be used. * Add test asserting aliases set appropriate config option --- src/gcp/mod.rs | 144 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 128 insertions(+), 16 deletions(-) diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 177812f..28972c4 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -121,8 +121,13 @@ enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName {}, - #[snafu(display("Missing service account path"))] - MissingServiceAccountPath, + #[snafu(display("Missing service account path or key"))] + MissingServiceAccountPathOrKey, + + #[snafu(display( + "One of service account path or service account key may be provided." + ))] + ServiceAccountPathAndKeyProvided, #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, @@ -800,14 +805,15 @@ pub struct GoogleCloudStorageBuilder { bucket_name: Option, url: Option, service_account_path: Option, + service_account_key: Option, retry_config: RetryConfig, client_options: ClientOptions, } /// Configuration keys for [`GoogleCloudStorageBuilder`] /// -/// Configuration via keys can be dome via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option) -/// or [`with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder. +/// Configuration via keys can be done via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option) +/// or [`try_with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder. /// /// # Example /// ``` @@ -835,8 +841,17 @@ pub enum GoogleConfigKey { /// Supported keys: /// - `google_service_account` /// - `service_account` + /// - `google_service_account_path` + /// - `service_account_path` ServiceAccount, + /// The serialized service account key. 
+ /// + /// Supported keys: + /// - `google_service_account_key` + /// - `service_account_key` + ServiceAccountKey, + /// Bucket name /// /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details. @@ -853,6 +868,7 @@ impl AsRef for GoogleConfigKey { fn as_ref(&self) -> &str { match self { Self::ServiceAccount => "google_service_account", + Self::ServiceAccountKey => "google_service_account_key", Self::Bucket => "google_bucket", } } @@ -863,7 +879,13 @@ impl FromStr for GoogleConfigKey { fn from_str(s: &str) -> Result { match s { - "google_service_account" | "service_account" => Ok(Self::ServiceAccount), + "google_service_account" + | "service_account" + | "google_service_account_path" + | "service_account_path" => Ok(Self::ServiceAccount), + "google_service_account_key" | "service_account_key" => { + Ok(Self::ServiceAccountKey) + } "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { Ok(Self::Bucket) } @@ -877,6 +899,7 @@ impl Default for GoogleCloudStorageBuilder { Self { bucket_name: None, service_account_path: None, + service_account_key: None, retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), url: None, @@ -894,13 +917,17 @@ impl GoogleCloudStorageBuilder { /// /// Variables extracted from environment: /// * GOOGLE_SERVICE_ACCOUNT: location of service account file + /// * GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file /// * SERVICE_ACCOUNT: (alias) location of service account file + /// * GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key + /// * GOOGLE_BUCKET: bucket name + /// * GOOGLE_BUCKET_NAME: (alias) bucket name /// /// # Example /// ``` /// use object_store::gcp::GoogleCloudStorageBuilder; /// - /// let azure = GoogleCloudStorageBuilder::from_env() + /// let gcs = GoogleCloudStorageBuilder::from_env() /// .with_bucket_name("foo") /// .build(); /// ``` @@ -957,6 +984,9 @@ impl GoogleCloudStorageBuilder { GoogleConfigKey::ServiceAccount => { self.service_account_path = Some(value.into()) } + GoogleConfigKey::ServiceAccountKey => { + self.service_account_key = Some(value.into()) + } GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), }; Ok(self) @@ -1001,8 +1031,12 @@ impl GoogleCloudStorageBuilder { self } - /// Set the path to the service account file (required). Example - /// `"/tmp/gcs.json"` + /// Set the path to the service account file. + /// + /// This or [`GoogleCloudStorageBuilder::with_service_account_key`] must be + /// set. + /// + /// Example `"/tmp/gcs.json"`. /// /// Example contents of `gcs.json`: /// @@ -1022,6 +1056,19 @@ impl GoogleCloudStorageBuilder { self } + /// Set the service account key. The service account must be in the JSON + /// format. + /// + /// This or [`GoogleCloudStorageBuilder::with_service_account_path`] must be + /// set. 
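Taken together with the path-based setter, this gives two mutually exclusive ways of supplying credentials; `build` returns an error if both or neither are provided. A sketch assuming the key JSON has already been fetched from wherever it is stored externally:

```
use object_store::gcp::GoogleCloudStorageBuilder;

fn gcs_with_inline_key(service_account_json: &str) -> object_store::Result<()> {
    // The serialized key is passed directly, so nothing is written to disk.
    let _gcs = GoogleCloudStorageBuilder::new()
        .with_service_account_key(service_account_json)
        .with_bucket_name("example-bucket")
        .build()?;
    Ok(())
}
```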
+ pub fn with_service_account_key( + mut self, + service_account: impl Into, + ) -> Self { + self.service_account_key = Some(service_account.into()); + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -1048,12 +1095,19 @@ impl GoogleCloudStorageBuilder { } let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; - let service_account_path = self - .service_account_path - .ok_or(Error::MissingServiceAccountPath)?; let client = self.client_options.client()?; - let credentials = reader_credentials_file(service_account_path)?; + + let credentials = match (self.service_account_path, self.service_account_key) { + (Some(path), None) => reader_credentials_file(path)?, + (None, Some(key)) => { + serde_json::from_str(&key).context(DecodeCredentialsSnafu)? + } + (None, None) => return Err(Error::MissingServiceAccountPathOrKey.into()), + (Some(_), Some(_)) => { + return Err(Error::ServiceAccountPathAndKeyProvided.into()) + } + }; // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes let scope = "https://www.googleapis.com/auth/devstorage.full_control"; @@ -1110,6 +1164,8 @@ mod test { use bytes::Bytes; use std::collections::HashMap; use std::env; + use std::io::Write; + use tempfile::NamedTempFile; use crate::{ tests::{ @@ -1121,6 +1177,7 @@ mod test { use super::*; + const FAKE_KEY: &str = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; const NON_EXISTENT_NAME: &str = "nonexistentname"; // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set. @@ -1278,11 +1335,8 @@ mod test { #[tokio::test] async fn gcs_test_proxy_url() { - use std::io::Write; - use tempfile::NamedTempFile; let mut tfile = NamedTempFile::new().unwrap(); - let creds = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; - write!(tfile, "{}", creds).unwrap(); + write!(tfile, "{}", FAKE_KEY).unwrap(); let service_account_path = tfile.path(); let gcs = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) @@ -1318,6 +1372,27 @@ mod test { } } + #[test] + fn gcs_test_service_account_key_only() { + let _ = GoogleCloudStorageBuilder::new() + .with_service_account_key(FAKE_KEY) + .with_bucket_name("foo") + .build() + .unwrap(); + } + + #[test] + fn gcs_test_service_account_key_and_path() { + let mut tfile = NamedTempFile::new().unwrap(); + write!(tfile, "{}", FAKE_KEY).unwrap(); + let _ = GoogleCloudStorageBuilder::new() + .with_service_account_key(FAKE_KEY) + .with_service_account_path(tfile.path().to_str().unwrap()) + .with_bucket_name("foo") + .build() + .unwrap_err(); + } + #[test] fn gcs_test_config_from_map() { let google_service_account = "object_store:fake_service_account".to_string(); @@ -1371,4 +1446,41 @@ mod test { let builder = GoogleCloudStorageBuilder::new().try_with_options(&options); assert!(builder.is_err()); } + + #[test] + fn gcs_test_config_aliases() { + // Service account path + for alias in [ + "google_service_account", + "service_account", + "google_service_account_path", + "service_account_path", + ] { + let builder = GoogleCloudStorageBuilder::new() + .try_with_options([(alias, "/fake/path.json")]) + .unwrap(); + assert_eq!("/fake/path.json", builder.service_account_path.unwrap()); + } + + // Service account key + for alias in ["google_service_account_key", "service_account_key"] { + let builder = 
GoogleCloudStorageBuilder::new() + .try_with_options([(alias, FAKE_KEY)]) + .unwrap(); + assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); + } + + // Bucket name + for alias in [ + "google_bucket", + "google_bucket_name", + "bucket", + "bucket_name", + ] { + let builder = GoogleCloudStorageBuilder::new() + .try_with_options([(alias, "fake_bucket")]) + .unwrap(); + assert_eq!("fake_bucket", builder.bucket_name.unwrap()); + } + } } From 4135b9c2fc936b121c5ca6ef39c62e74c228417e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 10 Jan 2023 16:10:39 +0100 Subject: [PATCH 083/397] Remove azurite exception (#3497) --- src/azure/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 416883a..cbd5a35 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -965,10 +965,8 @@ mod tests { #[tokio::test] async fn azure_blob_test() { - let use_emulator = env::var("AZURE_USE_EMULATOR").is_ok(); let integration = maybe_skip_integration!().build().unwrap(); - // Azurite doesn't support listing with spaces - https://github.com/localstack/localstack/issues/6328 - put_get_delete_list_opts(&integration, use_emulator).await; + put_get_delete_list_opts(&integration, false).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; From d351482fcd653c907d659f949d7e76a788e40291 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 10 Jan 2023 23:01:34 -0800 Subject: [PATCH 084/397] Upgrade base64 to 0.21 (#3500) * Upgrade base64 to 0.21 * Move to function * Use prelude --- Cargo.toml | 2 +- src/azure/client.rs | 4 +++- src/azure/credential.rs | 6 ++++-- src/azure/mod.rs | 7 ++++++- src/gcp/credential.rs | 12 ++++-------- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e61a127..4be6d63 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,7 @@ url = "2.2" walkdir = "2" # Cloud storage support -base64 = { version = "0.20", default-features = false, features = ["std"], optional = true } +base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } quick-xml = { version = "0.27.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } diff --git a/src/azure/client.rs b/src/azure/client.rs index 556a2ad..426b3b1 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -25,6 +25,8 @@ use crate::{ BoxStream, ClientOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, StreamExt, }; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use itertools::Itertools; @@ -528,7 +530,7 @@ impl BlockList { for block_id in &self.blocks { let node = format!( "\t{}\n", - base64::encode(block_id) + BASE64_STANDARD.encode(block_id) ); s.push_str(&node); } diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 38e6e64..96ff8ce 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -19,6 +19,8 @@ use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::util::hmac_sha256; use crate::RetryConfig; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use chrono::Utc; use reqwest::header::ACCEPT; use reqwest::{ @@ -153,8 +155,8 @@ fn 
generate_authorization( key: &str, ) -> String { let str_to_sign = string_to_sign(h, u, method, account); - let auth = hmac_sha256(base64::decode(key).unwrap(), str_to_sign); - format!("SharedKey {}:{}", account, base64::encode(auth)) + let auth = hmac_sha256(BASE64_STANDARD.decode(key).unwrap(), str_to_sign); + format!("SharedKey {}:{}", account, BASE64_STANDARD.encode(auth)) } fn add_if_exists<'a>(h: &'a HeaderMap, key: &HeaderName) -> &'a str { diff --git a/src/azure/mod.rs b/src/azure/mod.rs index cbd5a35..3bce8e5 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -34,6 +34,8 @@ use crate::{ RetryConfig, }; use async_trait::async_trait; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::Bytes; use chrono::{TimeZone, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; @@ -330,7 +332,10 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { &self.location, Some(buf.into()), true, - &[("comp", "block"), ("blockid", &base64::encode(block_id))], + &[ + ("comp", "block"), + ("blockid", &BASE64_STANDARD.encode(block_id)), + ], ) .await?; diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index a2a98a3..cc157dd 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -18,17 +18,13 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; use crate::RetryConfig; -use base64::engine::fast_portable::FastPortable; +use base64::prelude::BASE64_URL_SAFE_NO_PAD; +use base64::Engine; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; use std::time::{Duration, Instant}; -const URL_SAFE_NO_PAD: FastPortable = FastPortable::from( - &base64::alphabet::URL_SAFE, - base64::engine::fast_portable::NO_PAD, -); - #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("No RSA key found in pem file"))] @@ -172,7 +168,7 @@ impl OAuthProvider { ) .context(SignSnafu)?; - let signature = base64::encode_engine(&sig_bytes, &URL_SAFE_NO_PAD); + let signature = BASE64_URL_SAFE_NO_PAD.encode(sig_bytes); let jwt = [message, signature].join("."); let body = [ @@ -224,5 +220,5 @@ fn decode_first_rsa_key(private_key_pem: String) -> Result { fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; - Ok(base64::encode_engine(string, &URL_SAFE_NO_PAD)) + Ok(BASE64_URL_SAFE_NO_PAD.encode(string)) } From e0dcd0cef52721aa2cf08b307664479921e05bd5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 17 Jan 2023 00:01:48 -0800 Subject: [PATCH 085/397] Update aws-config and aws-types requirements from 0.52 to 0.53 (#3539) --- Cargo.toml | 7 ++++--- src/aws/credential.rs | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4be6d63..8c9ede0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,8 +53,9 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } # AWS Profile support -aws-types = { version = "0.52", optional = true } -aws-config = { version = "0.52", optional = true } +aws-types = { version = "0.53", optional = true } +aws-credential-types = { version = "0.53", optional = true } +aws-config = { version = "0.53", optional = true } [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] @@ -64,7 +65,7 @@ aws = ["cloud"] http = ["cloud"] # Experimental support for AWS_PROFILE -aws_profile = ["aws", "aws-config", 
"aws-types"] +aws_profile = ["aws", "aws-config", "aws-types", "aws-credential-types"] [dev-dependencies] # In alphabetical order dotenv = "0.15.0" diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 199899d..3a6976d 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -518,7 +518,7 @@ mod profile { use super::*; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; - use aws_types::credentials::ProvideCredentials; + use aws_credential_types::provider::ProvideCredentials; use aws_types::region::Region; use std::time::SystemTime; From 49af593109ccd9bf7eb09ce82c8ccb1dee8ce49b Mon Sep 17 00:00:00 2001 From: Daniel Poelzleithner Date: Wed, 25 Jan 2023 10:46:19 +0100 Subject: [PATCH 086/397] Add ClientOption.allow_insecure (#3600) * Add ClientOption.allow_insecure Add option to allow insecure https connections. In local isolated test environments, it is normal to use self signed, local certificates for automated integration testing. * clarify with_allow_invalid_certificates Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/client/mod.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/client/mod.rs b/src/client/mod.rs index f07377e..d019e81 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -52,6 +52,7 @@ pub struct ClientOptions { default_headers: Option, proxy_url: Option, allow_http: bool, + allow_insecure: bool, timeout: Option, connect_timeout: Option, pool_idle_timeout: Option, @@ -106,6 +107,21 @@ impl ClientOptions { self.allow_http = allow_http; self } + /// Allows connections to invalid SSL certificates + /// * false (default): Only valid HTTPS certificates are allowed + /// * true: All HTTPS certificates are allowed + /// + /// # Warning + /// + /// You should think very carefully before using this method. If + /// invalid certificates are trusted, *any* certificate for *any* site + /// will be trusted for use. This includes expired certificates. 
This + /// introduces significant vulnerabilities, and should only be used + /// as a last resort or for testing + pub fn with_allow_invalid_certificates(mut self, allow_insecure: bool) -> Self { + self.allow_insecure = allow_insecure; + self + } /// Only use http1 connections pub fn with_http1_only(mut self) -> Self { @@ -259,6 +275,10 @@ impl ClientOptions { builder = builder.http2_prior_knowledge() } + if self.allow_insecure { + builder = builder.danger_accept_invalid_certs(self.allow_insecure) + } + builder .https_only(!self.allow_http) .build() From d4ad4152f22c64747deaed5bc7b92b9b33e10c59 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Wed, 25 Jan 2023 15:50:58 +0100 Subject: [PATCH 087/397] [object_store] support azure managed and workload identities (#3581) * feat: add azure managed identity credential * test: azure managed identity credential * feat: add azure federated token credential * test: add workload identity test * refactor: PR feedback * Update object_store/src/azure/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * refactor: id priorities * refactor: use managed identity as default credential * chore: remove usused parameter Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/azure/client.rs | 8 +- src/azure/credential.rs | 337 ++++++++++++++++++++++++++++++++++++++-- src/azure/mod.rs | 147 ++++++++++++++++-- 3 files changed, 459 insertions(+), 33 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index 426b3b1..e42950b 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -169,9 +169,11 @@ impl AzureClient { CredentialProvider::AccessKey(key) => { Ok(AzureCredential::AccessKey(key.to_owned())) } - CredentialProvider::ClientSecret(cred) => { - let token = cred - .fetch_token(&self.client, &self.config.retry_config) + CredentialProvider::TokenCredential(cache, cred) => { + let token = cache + .get_or_insert_with(|| { + cred.fetch_token(&self.client, &self.config.retry_config) + }) .await .context(AuthorizationSnafu)?; Ok(AzureCredential::AuthorizationToken( diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 96ff8ce..280d843 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -31,6 +31,7 @@ use reqwest::{ }, Client, Method, RequestBuilder, }; +use serde::Deserialize; use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::str; @@ -44,8 +45,11 @@ pub(crate) static DELETE_SNAPSHOTS: HeaderName = HeaderName::from_static("x-ms-delete-snapshots"); pub(crate) static COPY_SOURCE: HeaderName = HeaderName::from_static("x-ms-copy-source"); static CONTENT_MD5: HeaderName = HeaderName::from_static("content-md5"); -pub(crate) static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; +pub(crate) const RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; const CONTENT_TYPE_JSON: &str = "application/json"; +const MSI_SECRET_ENV_KEY: &str = "IDENTITY_HEADER"; +const MSI_API_VERSION: &str = "2019-08-01"; +const AZURE_STORAGE_SCOPE: &str = "https://storage.azure.com/.default"; #[derive(Debug, Snafu)] pub enum Error { @@ -54,6 +58,9 @@ pub enum Error { #[snafu(display("Error getting token response body: {}", source))] TokenResponseBody { source: reqwest::Error }, + + #[snafu(display("Error reading federated token file "))] + FederatedTokenFile, } pub type Result = std::result::Result; @@ -63,7 +70,7 @@ pub type Result = std::result::Result; pub enum CredentialProvider { AccessKey(String), SASToken(Vec<(String, String)>), - 
ClientSecret(ClientSecretOAuthProvider), + TokenCredential(TokenCache, Box), } pub(crate) enum AzureCredential { @@ -273,7 +280,16 @@ fn lexy_sort<'a>( values } -#[derive(serde::Deserialize, Debug)] +#[async_trait::async_trait] +pub trait TokenCredential: std::fmt::Debug + Send + Sync + 'static { + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>; +} + +#[derive(Deserialize, Debug)] struct TokenResponse { access_token: String, expires_in: u64, @@ -282,11 +298,9 @@ struct TokenResponse { /// Encapsulates the logic to perform an OAuth token challenge #[derive(Debug)] pub struct ClientSecretOAuthProvider { - scope: String, token_url: String, client_id: String, client_secret: String, - cache: TokenCache, } impl ClientSecretOAuthProvider { @@ -294,45 +308,220 @@ impl ClientSecretOAuthProvider { pub fn new( client_id: String, client_secret: String, - tenant_id: String, + tenant_id: impl AsRef, authority_host: Option, ) -> Self { let authority_host = authority_host .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); Self { - scope: "https://storage.azure.com/.default".to_owned(), - token_url: format!("{}/{}/oauth2/v2.0/token", authority_host, tenant_id), + token_url: format!( + "{}/{}/oauth2/v2.0/token", + authority_host, + tenant_id.as_ref() + ), client_id, client_secret, - cache: TokenCache::default(), } } +} +#[async_trait::async_trait] +impl TokenCredential for ClientSecretOAuthProvider { /// Fetch a token - pub async fn fetch_token( + async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result { - self.cache - .get_or_insert_with(|| self.fetch_token_inner(client, retry)) + ) -> Result> { + let response: TokenResponse = client + .request(Method::POST, &self.token_url) + .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) + .form(&[ + ("client_id", self.client_id.as_str()), + ("client_secret", self.client_secret.as_str()), + ("scope", AZURE_STORAGE_SCOPE), + ("grant_type", "client_credentials"), + ]) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .json() .await + .context(TokenResponseBodySnafu)?; + + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + + Ok(token) } +} + +fn expires_in_string<'de, D>(deserializer: D) -> std::result::Result +where + D: serde::de::Deserializer<'de>, +{ + let v = String::deserialize(deserializer)?; + v.parse::().map_err(serde::de::Error::custom) +} + +// NOTE: expires_on is a String version of unix epoch time, not an integer. +// +#[derive(Debug, Clone, Deserialize)] +struct MsiTokenResponse { + pub access_token: String, + #[serde(deserialize_with = "expires_in_string")] + pub expires_in: u64, +} + +/// Attempts authentication using a managed identity that has been assigned to the deployment environment. 
+/// +/// This authentication type works in Azure VMs, App Service and Azure Functions applications, as well as the Azure Cloud Shell +/// +#[derive(Debug)] +pub struct ImdsManagedIdentityOAuthProvider { + msi_endpoint: String, + client_id: Option, + object_id: Option, + msi_res_id: Option, + client: Client, +} + +impl ImdsManagedIdentityOAuthProvider { + /// Create a new [`ImdsManagedIdentityOAuthProvider`] for an azure backed store + pub fn new( + client_id: Option, + object_id: Option, + msi_res_id: Option, + msi_endpoint: Option, + client: Client, + ) -> Self { + let msi_endpoint = msi_endpoint.unwrap_or_else(|| { + "http://169.254.169.254/metadata/identity/oauth2/token".to_owned() + }); - /// Fetch a fresh token - async fn fetch_token_inner( + Self { + msi_endpoint, + client_id, + object_id, + msi_res_id, + client, + } + } +} + +#[async_trait::async_trait] +impl TokenCredential for ImdsManagedIdentityOAuthProvider { + /// Fetch a token + async fn fetch_token( + &self, + _client: &Client, + retry: &RetryConfig, + ) -> Result> { + let mut query_items = vec![ + ("api-version", MSI_API_VERSION), + ("resource", AZURE_STORAGE_SCOPE), + ]; + + let mut identity = None; + if let Some(client_id) = &self.client_id { + identity = Some(("client_id", client_id)); + } + if let Some(object_id) = &self.object_id { + identity = Some(("object_id", object_id)); + } + if let Some(msi_res_id) = &self.msi_res_id { + identity = Some(("msi_res_id", msi_res_id)); + } + if let Some((key, value)) = identity { + query_items.push((key, value)); + } + + let mut builder = self + .client + .request(Method::GET, &self.msi_endpoint) + .header("metadata", "true") + .query(&query_items); + + if let Ok(val) = std::env::var(MSI_SECRET_ENV_KEY) { + builder = builder.header("x-identity-header", val); + }; + + let response: MsiTokenResponse = builder + .send_retry(retry) + .await + .context(TokenRequestSnafu)? 
+ .json() + .await + .context(TokenResponseBodySnafu)?; + + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + + Ok(token) + } +} + +/// Credential for using workload identity dfederation +/// +/// +#[derive(Debug)] +pub struct WorkloadIdentityOAuthProvider { + token_url: String, + client_id: String, + federated_token_file: String, +} + +impl WorkloadIdentityOAuthProvider { + /// Create a new [`WorkloadIdentityOAuthProvider`] for an azure backed store + pub fn new( + client_id: impl Into, + federated_token_file: impl Into, + tenant_id: impl AsRef, + authority_host: Option, + ) -> Self { + let authority_host = authority_host + .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); + + Self { + token_url: format!( + "{}/{}/oauth2/v2.0/token", + authority_host, + tenant_id.as_ref() + ), + client_id: client_id.into(), + federated_token_file: federated_token_file.into(), + } + } +} + +#[async_trait::async_trait] +impl TokenCredential for WorkloadIdentityOAuthProvider { + /// Fetch a token + async fn fetch_token( &self, client: &Client, retry: &RetryConfig, ) -> Result> { + let token_str = std::fs::read_to_string(&self.federated_token_file) + .map_err(|_| Error::FederatedTokenFile)?; + + // https://learn.microsoft.com/en-us/azure/active-directory/develop/v2-oauth2-client-creds-grant-flow#third-case-access-token-request-with-a-federated-credential let response: TokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) .form(&[ ("client_id", self.client_id.as_str()), - ("client_secret", self.client_secret.as_str()), - ("scope", self.scope.as_str()), + ( + "client_assertion_type", + "urn:ietf:params:oauth:client-assertion-type:jwt-bearer", + ), + ("client_assertion", token_str.as_str()), + ("scope", AZURE_STORAGE_SCOPE), ("grant_type", "client_credentials"), ]) .send_retry(retry) @@ -350,3 +539,117 @@ impl ClientSecretOAuthProvider { Ok(token) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::mock_server::MockServer; + use futures::executor::block_on; + use hyper::body::to_bytes; + use hyper::{Body, Response}; + use reqwest::{Client, Method}; + use tempfile::NamedTempFile; + + #[tokio::test] + async fn test_managed_identity() { + let server = MockServer::new(); + + std::env::set_var(MSI_SECRET_ENV_KEY, "env-secret"); + + let endpoint = server.url(); + let client = Client::new(); + let retry_config = RetryConfig::default(); + + // Test IMDS + server.push_fn(|req| { + assert_eq!(req.uri().path(), "/metadata/identity/oauth2/token"); + assert!(req.uri().query().unwrap().contains("client_id=client_id")); + assert_eq!(req.method(), &Method::GET); + let t = req + .headers() + .get("x-identity-header") + .unwrap() + .to_str() + .unwrap(); + assert_eq!(t, "env-secret"); + let t = req.headers().get("metadata").unwrap().to_str().unwrap(); + assert_eq!(t, "true"); + Response::new(Body::from( + r#" + { + "access_token": "TOKEN", + "refresh_token": "", + "expires_in": "3599", + "expires_on": "1506484173", + "not_before": "1506480273", + "resource": "https://management.azure.com/", + "token_type": "Bearer" + } + "#, + )) + }); + + let credential = ImdsManagedIdentityOAuthProvider::new( + Some("client_id".into()), + None, + None, + Some(format!("{}/metadata/identity/oauth2/token", endpoint)), + client.clone(), + ); + + let token = credential + .fetch_token(&client, &retry_config) + .await + .unwrap(); + + assert_eq!(&token.token, 
"TOKEN"); + } + + #[tokio::test] + async fn test_workload_identity() { + let server = MockServer::new(); + let tokenfile = NamedTempFile::new().unwrap(); + let tenant = "tenant"; + std::fs::write(tokenfile.path(), "federated-token").unwrap(); + + let endpoint = server.url(); + let client = Client::new(); + let retry_config = RetryConfig::default(); + + // Test IMDS + server.push_fn(move |req| { + assert_eq!(req.uri().path(), format!("/{}/oauth2/v2.0/token", tenant)); + assert_eq!(req.method(), &Method::POST); + let body = block_on(to_bytes(req.into_body())).unwrap(); + let body = String::from_utf8(body.to_vec()).unwrap(); + assert!(body.contains("federated-token")); + Response::new(Body::from( + r#" + { + "access_token": "TOKEN", + "refresh_token": "", + "expires_in": 3599, + "expires_on": "1506484173", + "not_before": "1506480273", + "resource": "https://management.azure.com/", + "token_type": "Bearer" + } + "#, + )) + }); + + let credential = WorkloadIdentityOAuthProvider::new( + "client_id", + tokenfile.path().to_str().unwrap(), + tenant, + Some(endpoint.to_string()), + ); + + let token = credential + .fetch_token(&client, &retry_config) + .await + .unwrap(); + + assert_eq!(&token.token, "TOKEN"); + } +} diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 3bce8e5..1eea278 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -27,6 +27,7 @@ //! a way to drop old blocks. Instead unused blocks are automatically cleaned up //! after 7 days. use self::client::{BlockId, BlockList}; +use crate::client::token::TokenCache; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::Path, @@ -65,6 +66,8 @@ const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; const EMULATOR_ACCOUNT_KEY: &str = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="; +const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; + /// A specialized `Error` for Azure object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -393,6 +396,10 @@ pub struct MicrosoftAzureBuilder { authority_host: Option, url: Option, use_emulator: bool, + msi_endpoint: Option, + object_id: Option, + msi_resource_id: Option, + federated_token_file: Option, retry_config: RetryConfig, client_options: ClientOptions, } @@ -496,6 +503,36 @@ pub enum AzureConfigKey { /// - `object_store_use_emulator` /// - `use_emulator` UseEmulator, + + /// Endpoint to request a imds managed identity token + /// + /// Supported keys: + /// - `azure_msi_endpoint` + /// - `azure_identity_endpoint` + /// - `identity_endpoint` + /// - `msi_endpoint` + MsiEndpoint, + + /// Object id for use with managed identity authentication + /// + /// Supported keys: + /// - `azure_object_id` + /// - `object_id` + ObjectId, + + /// Msi resource id for use with managed identity authentication + /// + /// Supported keys: + /// - `azure_msi_resource_id` + /// - `msi_resource_id` + MsiResourceId, + + /// File containing token for Azure AD workload identity federation + /// + /// Supported keys: + /// - `azure_federated_token_file` + /// - `federated_token_file` + FederatedTokenFile, } impl AsRef for AzureConfigKey { @@ -509,6 +546,10 @@ impl AsRef for AzureConfigKey { Self::SasKey => "azure_storage_sas_key", Self::Token => "azure_storage_token", Self::UseEmulator => "azure_storage_use_emulator", + Self::MsiEndpoint => "azure_msi_endpoint", + Self::ObjectId => "azure_object_id", + Self::MsiResourceId => "azure_msi_resource_id", + Self::FederatedTokenFile => "azure_federated_token_file", } } } @@ 
-543,6 +584,15 @@ impl FromStr for AzureConfigKey { | "sas_token" => Ok(Self::SasKey), "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), + "azure_msi_endpoint" + | "azure_identity_endpoint" + | "identity_endpoint" + | "msi_endpoint" => Ok(Self::MsiEndpoint), + "azure_object_id" | "object_id" => Ok(Self::ObjectId), + "azure_msi_resource_id" | "msi_resource_id" => Ok(Self::MsiResourceId), + "azure_federated_token_file" | "federated_token_file" => { + Ok(Self::FederatedTokenFile) + } _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -600,6 +650,10 @@ impl MicrosoftAzureBuilder { builder.client_options.with_allow_http(str_is_truthy(&text)); } + if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { + builder = builder.with_msi_endpoint(text); + } + builder } @@ -644,6 +698,12 @@ impl MicrosoftAzureBuilder { AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), AzureConfigKey::SasKey => self.sas_key = Some(value.into()), AzureConfigKey::Token => self.bearer_token = Some(value.into()), + AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), + AzureConfigKey::ObjectId => self.object_id = Some(value.into()), + AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), + AzureConfigKey::FederatedTokenFile => { + self.federated_token_file = Some(value.into()) + } AzureConfigKey::UseEmulator => { self.use_emulator = str_is_truthy(&value.into()) } @@ -743,6 +803,24 @@ impl MicrosoftAzureBuilder { self } + /// Sets the client id for use in client secret or k8s federated credential flow + pub fn with_client_id(mut self, client_id: impl Into) -> Self { + self.client_id = Some(client_id.into()); + self + } + + /// Sets the client secret for use in client secret flow + pub fn with_client_secret(mut self, client_secret: impl Into) -> Self { + self.client_secret = Some(client_secret.into()); + self + } + + /// Sets the tenant id for use in client secret or k8s federated credential flow + pub fn with_tenant_id(mut self, tenant_id: impl Into) -> Self { + self.tenant_id = Some(tenant_id.into()); + self + } + /// Set query pairs appended to the url for shared access signature authorization pub fn with_sas_authorization( mut self, @@ -769,8 +847,8 @@ impl MicrosoftAzureBuilder { /// Sets an alternative authority host for OAuth based authorization /// common hosts for azure clouds are defined in [authority_hosts]. /// Defaults to - pub fn with_authority_host(mut self, authority_host: String) -> Self { - self.authority_host = Some(authority_host); + pub fn with_authority_host(mut self, authority_host: impl Into) -> Self { + self.authority_host = Some(authority_host.into()); self } @@ -792,6 +870,23 @@ impl MicrosoftAzureBuilder { self } + /// Sets the endpoint for acquiring managed identity token + pub fn with_msi_endpoint(mut self, msi_endpoint: impl Into) -> Self { + self.msi_endpoint = Some(msi_endpoint.into()); + self + } + + /// Sets a file path for acquiring azure federated identity token in k8s + /// + /// requires `client_id` and `tenant_id` to be set + pub fn with_federated_token_file( + mut self, + federated_token_file: impl Into, + ) -> Self { + self.federated_token_file = Some(federated_token_file.into()); + self + } + /// Configure a connection to container with given name on Microsoft Azure /// Blob store. 
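Taken together, the new setters give a fully code-driven path to workload identity without relying on environment variables. Below is a minimal sketch of how they might be combined ahead of the `build` method that follows; the account, container, and token-file path are hypothetical, and `with_account`/`with_container_name` are assumed from the existing builder API rather than introduced by this patch.

```rust
use object_store::azure::MicrosoftAzureBuilder;
use object_store::ObjectStore;

fn workload_identity_store() -> Result<impl ObjectStore, object_store::Error> {
    // Supplying client id + tenant id + federated token file selects the
    // WorkloadIdentityOAuthProvider branch in `build()`; when no credentials
    // are configured at all, `build()` now falls back to the IMDS managed
    // identity provider instead of erroring out.
    MicrosoftAzureBuilder::new()
        .with_account("devaccount")
        .with_container_name("data")
        .with_client_id("00000000-0000-0000-0000-000000000000")
        .with_tenant_id("11111111-1111-1111-1111-111111111111")
        .with_federated_token_file("/var/run/secrets/azure/tokens/azure-identity-token")
        .build()
}
```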
pub fn build(mut self) -> Result { @@ -821,28 +916,54 @@ impl MicrosoftAzureBuilder { let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; let credential = if let Some(bearer_token) = self.bearer_token { - Ok(credential::CredentialProvider::AccessKey(bearer_token)) + credential::CredentialProvider::AccessKey(bearer_token) } else if let Some(access_key) = self.access_key { - Ok(credential::CredentialProvider::AccessKey(access_key)) + credential::CredentialProvider::AccessKey(access_key) + } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = + (&self.client_id, &self.tenant_id, self.federated_token_file) + { + let client_credential = credential::WorkloadIdentityOAuthProvider::new( + client_id, + federated_token_file, + tenant_id, + self.authority_host, + ); + credential::CredentialProvider::TokenCredential( + TokenCache::default(), + Box::new(client_credential), + ) } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = - (self.client_id, self.client_secret, self.tenant_id) + (&self.client_id, self.client_secret, &self.tenant_id) { let client_credential = credential::ClientSecretOAuthProvider::new( - client_id, + client_id.clone(), client_secret, tenant_id, self.authority_host, ); - Ok(credential::CredentialProvider::ClientSecret( - client_credential, - )) + credential::CredentialProvider::TokenCredential( + TokenCache::default(), + Box::new(client_credential), + ) } else if let Some(query_pairs) = self.sas_query_pairs { - Ok(credential::CredentialProvider::SASToken(query_pairs)) + credential::CredentialProvider::SASToken(query_pairs) } else if let Some(sas) = self.sas_key { - Ok(credential::CredentialProvider::SASToken(split_sas(&sas)?)) + credential::CredentialProvider::SASToken(split_sas(&sas)?) 
} else { - Err(Error::MissingCredentials {}) - }?; + let client = + self.client_options.clone().with_allow_http(true).client()?; + let msi_credential = credential::ImdsManagedIdentityOAuthProvider::new( + self.client_id, + self.object_id, + self.msi_resource_id, + self.msi_endpoint, + client, + ); + credential::CredentialProvider::TokenCredential( + TokenCache::default(), + Box::new(msi_credential), + ) + }; (false, url, credential, account_name) }; From 31091810adfe65fdc47c8e8d45b09acf552951b9 Mon Sep 17 00:00:00 2001 From: Marius S <39998+winding-lines@users.noreply.github.com> Date: Wed, 25 Jan 2023 07:24:30 -0800 Subject: [PATCH 088/397] Additional GCP authentication (#3541) * Implement authentication with instance and application credentials * Fix link in documentation * Address feedback * Instantiate InstanceCredentialsProvider client just once --- src/gcp/credential.rs | 255 +++++++++++++++++++++++++++++++++++++++++- src/gcp/mod.rs | 166 ++++++++++++++------------- 2 files changed, 344 insertions(+), 77 deletions(-) diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index cc157dd..5646856 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -17,16 +17,30 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; +use crate::ClientOptions; use crate::RetryConfig; +use async_trait::async_trait; use base64::prelude::BASE64_URL_SAFE_NO_PAD; use base64::Engine; +use futures::TryFutureExt; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; +use std::env; +use std::fs::File; +use std::io::BufReader; +use std::path::Path; use std::time::{Duration, Instant}; +use tracing::info; #[derive(Debug, Snafu)] pub enum Error { + #[snafu(display("Unable to open service account file: {}", source))] + OpenCredentials { source: std::io::Error }, + + #[snafu(display("Unable to decode service account file: {}", source))] + DecodeCredentials { source: serde_json::Error }, + #[snafu(display("No RSA key found in pem file"))] MissingKey, @@ -47,6 +61,12 @@ pub enum Error { #[snafu(display("Error getting token response body: {}", source))] TokenResponseBody { source: reqwest::Error }, + + #[snafu(display("A configuration file was passed in but was not used."))] + UnusedConfigurationFile, + + #[snafu(display("Error creating client: {}", source))] + Client { source: crate::Error }, } pub type Result = std::result::Result; @@ -104,6 +124,15 @@ struct TokenResponse { expires_in: u64, } +#[async_trait] +pub trait TokenProvider: std::fmt::Debug + Send + Sync { + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>; +} + /// Encapsulates the logic to perform an OAuth token challenge #[derive(Debug)] pub struct OAuthProvider { @@ -138,9 +167,12 @@ impl OAuthProvider { random: ring::rand::SystemRandom::new(), }) } +} +#[async_trait] +impl TokenProvider for OAuthProvider { /// Fetch a fresh token - pub async fn fetch_token( + async fn fetch_token( &self, client: &Client, retry: &RetryConfig, @@ -195,6 +227,69 @@ impl OAuthProvider { } } +fn read_credentials_file( + service_account_path: impl AsRef, +) -> Result +where + T: serde::de::DeserializeOwned, +{ + let file = File::open(service_account_path).context(OpenCredentialsSnafu)?; + let reader = BufReader::new(file); + serde_json::from_reader(reader).context(DecodeCredentialsSnafu) +} + +/// A deserialized `service-account-********.json`-file. 
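Since the struct below is now public and can be built from a raw key string as well as from a file, a short module-local sketch of the minimal JSON it accepts may help; the key material and e-mail address are placeholders, and `from_key` is the helper defined further down in this module.

```rust
fn parse_inline_key() -> Result<ServiceAccountCredentials> {
    // Only `private_key` and `client_email` are required; `gcs_base_url` and
    // `disable_oauth` fall back to their serde defaults.
    let key = r#"{
        "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
        "client_email": "ci-runner@my-project.iam.gserviceaccount.com"
    }"#;
    ServiceAccountCredentials::from_key(key)
}
```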
+#[derive(serde::Deserialize, Debug)] +pub struct ServiceAccountCredentials { + /// The private key in RSA format. + pub private_key: String, + + /// The email address associated with the service account. + pub client_email: String, + + /// Base URL for GCS + #[serde(default = "default_gcs_base_url")] + pub gcs_base_url: String, + + /// Disable oauth and use empty tokens. + #[serde(default = "default_disable_oauth")] + pub disable_oauth: bool, +} + +pub fn default_gcs_base_url() -> String { + "https://storage.googleapis.com".to_owned() +} + +pub fn default_disable_oauth() -> bool { + false +} + +impl ServiceAccountCredentials { + /// Create a new [`ServiceAccountCredentials`] from a file. + pub fn from_file>(path: P) -> Result { + read_credentials_file(path) + } + + /// Create a new [`ServiceAccountCredentials`] from a string. + pub fn from_key(key: &str) -> Result { + serde_json::from_str(key).context(DecodeCredentialsSnafu) + } + + /// Create an [`OAuthProvider`] from this credentials struct. + pub fn token_provider( + self, + scope: &str, + audience: &str, + ) -> Result> { + Ok(Box::new(OAuthProvider::new( + self.client_email, + self.private_key, + scope.to_string(), + audience.to_string(), + )?) as Box) + } +} + /// Returns the number of seconds since unix epoch fn seconds_since_epoch() -> u64 { std::time::SystemTime::now() @@ -205,7 +300,7 @@ fn seconds_since_epoch() -> u64 { fn decode_first_rsa_key(private_key_pem: String) -> Result { use rustls_pemfile::Item; - use std::io::{BufReader, Cursor}; + use std::io::Cursor; let mut cursor = Cursor::new(private_key_pem); let mut reader = BufReader::new(&mut cursor); @@ -222,3 +317,159 @@ fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; Ok(BASE64_URL_SAFE_NO_PAD.encode(string)) } + +/// A provider that uses the Google Cloud Platform metadata server to fetch a token. +/// +/// +#[derive(Debug, Default)] +pub struct InstanceCredentialProvider { + audience: String, + client: Client, +} + +impl InstanceCredentialProvider { + /// Create a new [`InstanceCredentialProvider`], we need to control the client in order to enable http access so save the options. + pub fn new>( + audience: T, + client_options: ClientOptions, + ) -> Result { + client_options + .with_allow_http(true) + .client() + .map(|client| Self { + audience: audience.into(), + client, + }) + .context(ClientSnafu) + } +} + +/// Make a request to the metadata server to fetch a token, using a a given hostname. +async fn make_metadata_request( + client: &Client, + hostname: &str, + retry: &RetryConfig, + audience: &str, +) -> Result { + let url = format!( + "http://{}/computeMetadata/v1/instance/service-accounts/default/token", + hostname + ); + let response: TokenResponse = client + .request(Method::GET, url) + .header("Metadata-Flavor", "Google") + .query(&[("audience", audience)]) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .json() + .await + .context(TokenResponseBodySnafu)?; + Ok(response) +} + +#[async_trait] +impl TokenProvider for InstanceCredentialProvider { + /// Fetch a token from the metadata server. + /// Since the connection is local we need to enable http access and don't actually use the client object passed in. 
+ async fn fetch_token( + &self, + _client: &Client, + retry: &RetryConfig, + ) -> Result> { + const METADATA_IP: &str = "169.254.169.254"; + const METADATA_HOST: &str = "metadata"; + + info!("fetching token from metadata server"); + let response = + make_metadata_request(&self.client, METADATA_HOST, retry, &self.audience) + .or_else(|_| { + make_metadata_request( + &self.client, + METADATA_IP, + retry, + &self.audience, + ) + }) + .await?; + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + Ok(token) + } +} + +/// A deserialized `application_default_credentials.json`-file. +/// +#[derive(serde::Deserialize, Debug)] +pub struct ApplicationDefaultCredentials { + client_id: String, + client_secret: String, + refresh_token: String, + #[serde(rename = "type")] + type_: String, +} + +impl ApplicationDefaultCredentials { + const DEFAULT_TOKEN_GCP_URI: &'static str = + "https://accounts.google.com/o/oauth2/token"; + const CREDENTIALS_PATH: &'static str = + ".config/gcloud/application_default_credentials.json"; + const EXPECTED_TYPE: &str = "authorized_user"; + + // Create a new application default credential in the following situations: + // 1. a file is passed in and the type matches. + // 2. without argument if the well-known configuration file is present. + pub fn new(path: Option<&str>) -> Result, Error> { + if let Some(path) = path { + if let Ok(credentials) = read_credentials_file::(path) { + if credentials.type_ == Self::EXPECTED_TYPE { + return Ok(Some(credentials)); + } + } + // Return an error if the path has not been used. + return Err(Error::UnusedConfigurationFile); + } + if let Some(home) = env::var_os("HOME") { + let path = Path::new(&home).join(Self::CREDENTIALS_PATH); + + // It's expected for this file to not exist unless it has been explicitly configured by the user. + if path.try_exists().unwrap_or(false) { + return read_credentials_file::(path).map(Some); + } + } + Ok(None) + } +} + +#[async_trait] +impl TokenProvider for ApplicationDefaultCredentials { + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result, Error> { + let body = [ + ("grant_type", "refresh_token"), + ("client_id", &self.client_id), + ("client_secret", &self.client_secret), + ("refresh_token", &self.refresh_token), + ]; + + let response = client + .request(Method::POST, Self::DEFAULT_TOKEN_GCP_URI) + .form(&body) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .json::() + .await + .context(TokenResponseBodySnafu)?; + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + Ok(token) + } +} diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 28972c4..871413b 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -30,8 +30,7 @@ //! consider implementing automatic clean up of unused parts that are older than one //! week. 
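With the extra providers above, a bucket can be opened without any explicit service-account configuration: `build()` will try application default credentials and then the instance metadata server. A rough sketch, assuming a hypothetical bucket name:

```rust
use object_store::gcp::GoogleCloudStorageBuilder;
use object_store::ObjectStore;

fn ambient_credentials_store() -> Result<impl ObjectStore, object_store::Error> {
    // No service account path or key is set, so credential discovery falls back
    // to ~/.config/gcloud/application_default_credentials.json and finally to
    // the GCE/GKE metadata server.
    GoogleCloudStorageBuilder::new()
        .with_bucket_name("my-bucket")
        .build()
}
```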
use std::collections::BTreeSet; -use std::fs::File; -use std::io::{self, BufReader}; +use std::io; use std::ops::Range; use std::str::FromStr; use std::sync::Arc; @@ -59,18 +58,15 @@ use crate::{ RetryConfig, }; -use credential::OAuthProvider; +use self::credential::{ + default_gcs_base_url, ApplicationDefaultCredentials, InstanceCredentialProvider, + ServiceAccountCredentials, TokenProvider, +}; mod credential; #[derive(Debug, Snafu)] enum Error { - #[snafu(display("Unable to open service account file: {}", source))] - OpenCredentials { source: std::io::Error }, - - #[snafu(display("Unable to decode service account file: {}", source))] - DecodeCredentials { source: serde_json::Error }, - #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] InvalidXMLResponse { source: quick_xml::de::DeError, @@ -121,8 +117,8 @@ enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName {}, - #[snafu(display("Missing service account path or key"))] - MissingServiceAccountPathOrKey, + #[snafu(display("Could not find either metadata credentials or configuration properties to initialize GCS credentials."))] + MissingCredentials, #[snafu(display( "One of service account path or service account key may be provided." @@ -185,32 +181,6 @@ impl From for super::Error { } } -/// A deserialized `service-account-********.json`-file. -#[derive(serde::Deserialize, Debug)] -struct ServiceAccountCredentials { - /// The private key in RSA format. - pub private_key: String, - - /// The email address associated with the service account. - pub client_email: String, - - /// Base URL for GCS - #[serde(default = "default_gcs_base_url")] - pub gcs_base_url: String, - - /// Disable oauth and use empty tokens. - #[serde(default = "default_disable_oauth")] - pub disable_oauth: bool, -} - -fn default_gcs_base_url() -> String { - "https://storage.googleapis.com".to_owned() -} - -fn default_disable_oauth() -> bool { - false -} - #[derive(serde::Deserialize, Debug)] #[serde(rename_all = "camelCase")] struct ListResponse { @@ -267,7 +237,7 @@ struct GoogleCloudStorageClient { client: Client, base_url: String, - oauth_provider: Option, + token_provider: Option>>, token_cache: TokenCache, bucket_name: String, @@ -282,11 +252,11 @@ struct GoogleCloudStorageClient { impl GoogleCloudStorageClient { async fn get_token(&self) -> Result { - if let Some(oauth_provider) = &self.oauth_provider { + if let Some(token_provider) = &self.token_provider { Ok(self .token_cache .get_or_insert_with(|| { - oauth_provider.fetch_token(&self.client, &self.retry_config) + token_provider.fetch_token(&self.client, &self.retry_config) }) .await .context(CredentialSnafu)?) @@ -779,14 +749,6 @@ impl ObjectStore for GoogleCloudStorage { } } -fn reader_credentials_file( - service_account_path: impl AsRef, -) -> Result { - let file = File::open(service_account_path).context(OpenCredentialsSnafu)?; - let reader = BufReader::new(file); - Ok(serde_json::from_reader(reader).context(DecodeCredentialsSnafu)?) -} - /// Configure a connection to Google Cloud Storage using the specified /// credentials. /// @@ -806,6 +768,7 @@ pub struct GoogleCloudStorageBuilder { url: Option, service_account_path: Option, service_account_key: Option, + application_credentials_path: Option, retry_config: RetryConfig, client_options: ClientOptions, } @@ -862,6 +825,11 @@ pub enum GoogleConfigKey { /// - `bucket` /// - `bucket_name` Bucket, + + /// Application credentials path + /// + /// See [`GoogleCloudStorageBuilder::with_application_credentials`]. 
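The variant declared just below also gets a string spelling, so it can be supplied through the builder's string-keyed option API as well as through the typed setter. The `FromStr` mapping added further down can be exercised on its own; a small sketch:

```rust
use std::str::FromStr;
use object_store::gcp::GoogleConfigKey;

fn main() {
    // "google_application_credentials" is the only accepted spelling for this key.
    assert!(matches!(
        GoogleConfigKey::from_str("google_application_credentials"),
        Ok(GoogleConfigKey::ApplicationCredentials)
    ));
}
```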
+ ApplicationCredentials, } impl AsRef for GoogleConfigKey { @@ -870,6 +838,7 @@ impl AsRef for GoogleConfigKey { Self::ServiceAccount => "google_service_account", Self::ServiceAccountKey => "google_service_account_key", Self::Bucket => "google_bucket", + Self::ApplicationCredentials => "google_application_credentials", } } } @@ -889,6 +858,7 @@ impl FromStr for GoogleConfigKey { "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { Ok(Self::Bucket) } + "google_application_credentials" => Ok(Self::ApplicationCredentials), _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -900,6 +870,7 @@ impl Default for GoogleCloudStorageBuilder { bucket_name: None, service_account_path: None, service_account_key: None, + application_credentials_path: None, retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), url: None, @@ -988,6 +959,9 @@ impl GoogleCloudStorageBuilder { self.service_account_key = Some(value.into()) } GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), + GoogleConfigKey::ApplicationCredentials => { + self.application_credentials_path = Some(value.into()) + } }; Ok(self) } @@ -1069,6 +1043,17 @@ impl GoogleCloudStorageBuilder { self } + /// Set the path to the application credentials file. + /// + /// + pub fn with_application_credentials( + mut self, + application_credentials_path: impl Into, + ) -> Self { + self.application_credentials_path = Some(application_credentials_path.into()); + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -1098,44 +1083,75 @@ impl GoogleCloudStorageBuilder { let client = self.client_options.client()?; - let credentials = match (self.service_account_path, self.service_account_key) { - (Some(path), None) => reader_credentials_file(path)?, - (None, Some(key)) => { - serde_json::from_str(&key).context(DecodeCredentialsSnafu)? - } - (None, None) => return Err(Error::MissingServiceAccountPathOrKey.into()), - (Some(_), Some(_)) => { - return Err(Error::ServiceAccountPathAndKeyProvided.into()) - } - }; + // First try to initialize from the service account information. + let service_account_credentials = + match (self.service_account_path, self.service_account_key) { + (Some(path), None) => Some( + ServiceAccountCredentials::from_file(path) + .context(CredentialSnafu)?, + ), + (None, Some(key)) => Some( + ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?, + ), + (None, None) => None, + (Some(_), Some(_)) => { + return Err(Error::ServiceAccountPathAndKeyProvided.into()) + } + }; + + // Then try to initialize from the application credentials file, or the environment. 
+ let application_default_credentials = ApplicationDefaultCredentials::new( + self.application_credentials_path.as_deref(), + ) + .context(CredentialSnafu)?; + + let disable_oauth = service_account_credentials + .as_ref() + .map(|c| c.disable_oauth) + .unwrap_or(false); + + let gcs_base_url = service_account_credentials + .as_ref() + .map(|c| c.gcs_base_url.clone()) + .unwrap_or_else(default_gcs_base_url); // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes let scope = "https://www.googleapis.com/auth/devstorage.full_control"; - let audience = "https://www.googleapis.com/oauth2/v4/token".to_string(); - - let oauth_provider = (!credentials.disable_oauth) - .then(|| { - OAuthProvider::new( - credentials.client_email, - credentials.private_key, - scope.to_string(), - audience, + let audience = "https://www.googleapis.com/oauth2/v4/token"; + + let token_provider = if disable_oauth { + None + } else { + let best_provider = if let Some(credentials) = service_account_credentials { + Some( + credentials + .token_provider(scope, audience) + .context(CredentialSnafu)?, ) - }) - .transpose() - .context(CredentialSnafu)?; + } else if let Some(credentials) = application_default_credentials { + Some(Box::new(credentials) as Box) + } else { + Some(Box::new( + InstanceCredentialProvider::new( + audience, + self.client_options.clone(), + ) + .context(CredentialSnafu)?, + ) as Box) + }; + + // A provider is required at this point, bail out if we don't have one. + Some(best_provider.ok_or(Error::MissingCredentials)?) + }; let encoded_bucket_name = percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); - // The cloud storage crate currently only supports authentication via - // environment variables. Set the environment variable explicitly so - // that we can optionally accept command line arguments instead. 
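The selection order implemented above is: an explicit service account (path or key), then application default credentials, then the instance metadata server, with `disable_oauth` short-circuiting all of them. A sketch of the first case, using the existing `with_service_account_key` setter; the key contents and bucket name are placeholders.

```rust
use object_store::gcp::GoogleCloudStorageBuilder;
use object_store::ObjectStore;

fn explicit_key_store(key_json: &str) -> Result<impl ObjectStore, object_store::Error> {
    // An inline service-account key takes precedence over application default
    // credentials and the metadata server.
    GoogleCloudStorageBuilder::new()
        .with_bucket_name("my-bucket")
        .with_service_account_key(key_json)
        .build()
}
```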
Ok(GoogleCloudStorage { client: Arc::new(GoogleCloudStorageClient { client, - base_url: credentials.gcs_base_url, - oauth_provider, + base_url: gcs_base_url, + token_provider: token_provider.map(Arc::new), token_cache: Default::default(), bucket_name, bucket_name_encoded: encoded_bucket_name, From 08bd27fabe91f7620dda2bed4fd9ab5fcac4196f Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 26 Jan 2023 20:42:40 -0800 Subject: [PATCH 089/397] Fix clippy (#3612) --- src/aws/credential.rs | 14 ++++++------- src/aws/mod.rs | 17 ++++++++-------- src/azure/client.rs | 8 ++++---- src/azure/credential.rs | 4 ++-- src/azure/mod.rs | 2 +- src/client/backoff.rs | 2 +- src/client/retry.rs | 4 ++-- src/gcp/credential.rs | 3 +-- src/gcp/mod.rs | 16 ++++++--------- src/lib.rs | 16 ++++++--------- src/local.rs | 9 ++++----- src/memory.rs | 5 ++--- src/multipart.rs | 2 +- src/path/mod.rs | 44 +++++++++++------------------------------ 14 files changed, 56 insertions(+), 90 deletions(-) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 3a6976d..cba5584 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -207,7 +207,7 @@ fn hex_encode(bytes: &[u8]) -> String { let mut out = String::with_capacity(bytes.len() * 2); for byte in bytes { // String writing is infallible - let _ = write!(out, "{:02x}", byte); + let _ = write!(out, "{byte:02x}"); } out } @@ -397,7 +397,7 @@ async fn instance_creds( const CREDENTIALS_PATH: &str = "latest/meta-data/iam/security-credentials"; const AWS_EC2_METADATA_TOKEN_HEADER: &str = "X-aws-ec2-metadata-token"; - let token_url = format!("{}/latest/api/token", endpoint); + let token_url = format!("{endpoint}/latest/api/token"); let token_result = client .request(Method::PUT, token_url) @@ -416,7 +416,7 @@ async fn instance_creds( Err(e) => return Err(e.into()), }; - let role_url = format!("{}/{}/", endpoint, CREDENTIALS_PATH); + let role_url = format!("{endpoint}/{CREDENTIALS_PATH}/"); let mut role_request = client.request(Method::GET, role_url); if let Some(token) = &token { @@ -425,7 +425,7 @@ async fn instance_creds( let role = role_request.send_retry(retry_config).await?.text().await?; - let creds_url = format!("{}/{}/{}", endpoint, CREDENTIALS_PATH, role); + let creds_url = format!("{endpoint}/{CREDENTIALS_PATH}/{role}"); let mut creds_request = client.request(Method::GET, creds_url); if let Some(token) = &token { creds_request = creds_request.header(AWS_EC2_METADATA_TOKEN_HEADER, token); @@ -483,7 +483,7 @@ async fn web_identity( endpoint: &str, ) -> Result>, StdError> { let token = std::fs::read_to_string(token_path) - .map_err(|e| format!("Failed to read token file '{}': {}", token_path, e))?; + .map_err(|e| format!("Failed to read token file '{token_path}': {e}"))?; let bytes = client .request(Method::POST, endpoint) @@ -501,7 +501,7 @@ async fn web_identity( .await?; let resp: AssumeRoleResponse = quick_xml::de::from_reader(bytes.reader()) - .map_err(|e| format!("Invalid AssumeRoleWithWebIdentity response: {}", e))?; + .map_err(|e| format!("Invalid AssumeRoleWithWebIdentity response: {e}"))?; let creds = resp.assume_role_with_web_identity_result.credentials; let now = Utc::now(); @@ -677,7 +677,7 @@ mod tests { // Verify only allows IMDSv2 let resp = client - .request(Method::GET, format!("{}/latest/meta-data/ami-id", endpoint)) + .request(Method::GET, format!("{endpoint}/latest/meta-data/ami-id")) .send() .await .unwrap(); diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 2017469..a1c9eae 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ 
-614,7 +614,7 @@ impl AmazonS3Builder { std::env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") { builder.metadata_endpoint = - Some(format!("{}{}", METADATA_ENDPOINT, metadata_relative_uri)); + Some(format!("{METADATA_ENDPOINT}{metadata_relative_uri}")); } if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { @@ -896,7 +896,7 @@ impl AmazonS3Builder { let session_name = std::env::var("AWS_ROLE_SESSION_NAME") .unwrap_or_else(|_| "WebIdentitySession".to_string()); - let endpoint = format!("https://sts.{}.amazonaws.com", region); + let endpoint = format!("https://sts.{region}.amazonaws.com"); // Disallow non-HTTPs requests let client = self @@ -948,15 +948,15 @@ impl AmazonS3Builder { // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then // `endpoint` should have bucket name included. if self.virtual_hosted_style_request { - endpoint = self.endpoint.unwrap_or_else(|| { - format!("https://{}.s3.{}.amazonaws.com", bucket, region) - }); + endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); bucket_endpoint = endpoint.clone(); } else { endpoint = self .endpoint - .unwrap_or_else(|| format!("https://s3.{}.amazonaws.com", region)); - bucket_endpoint = format!("{}/{}", endpoint, bucket); + .unwrap_or_else(|| format!("https://s3.{region}.amazonaws.com")); + bucket_endpoint = format!("{endpoint}/{bucket}"); } let config = S3Config { @@ -1137,8 +1137,7 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); - let metadata_uri = - format!("{}{}", METADATA_ENDPOINT, container_creds_relative_uri); + let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); } diff --git a/src/azure/client.rs b/src/azure/client.rs index e42950b..39da717 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -179,12 +179,12 @@ impl AzureClient { Ok(AzureCredential::AuthorizationToken( // we do the conversion to a HeaderValue here, since it is fallible // and we wna to use it in an infallible function - HeaderValue::from_str(&format!("Bearer {}", token)).map_err( - |err| crate::Error::Generic { + HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { + crate::Error::Generic { store: "MicrosoftAzure", source: Box::new(err), - }, - )?, + } + })?, )) } CredentialProvider::SASToken(sas) => { diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 280d843..67023d2 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -593,7 +593,7 @@ mod tests { Some("client_id".into()), None, None, - Some(format!("{}/metadata/identity/oauth2/token", endpoint)), + Some(format!("{endpoint}/metadata/identity/oauth2/token")), client.clone(), ); @@ -618,7 +618,7 @@ mod tests { // Test IMDS server.push_fn(move |req| { - assert_eq!(req.uri().path(), format!("/{}/oauth2/v2.0/token", tenant)); + assert_eq!(req.uri().path(), format!("/{tenant}/oauth2/v2.0/token")); assert_eq!(req.method(), &Method::POST); let body = block_on(to_bytes(req.into_body())).unwrap(); let body = String::from_utf8(body.to_vec()).unwrap(); diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 1eea278..5296906 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -327,7 +327,7 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { buf: Vec, part_idx: usize, ) -> Result { - let content_id = format!("{:20}", part_idx); + let content_id = format!("{part_idx:20}"); let block_id: BlockId = 
content_id.clone().into(); self.client diff --git a/src/client/backoff.rs b/src/client/backoff.rs index 5a6126c..a4ca976 100644 --- a/src/client/backoff.rs +++ b/src/client/backoff.rs @@ -123,7 +123,7 @@ mod tests { }; let assert_fuzzy_eq = - |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{} != {}", a, b); + |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{a} != {b}"); // Create a static rng that takes the minimum of the range let rng = Box::new(StepRng::new(0, 0)); diff --git a/src/client/retry.rs b/src/client/retry.rs index cee86b3..e6dd2eb 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -41,7 +41,7 @@ impl std::fmt::Display for Error { self.message, self.retries )?; if let Some(source) = &self.source { - write!(f, ": {}", source)?; + write!(f, ": {source}")?; } Ok(()) } @@ -171,7 +171,7 @@ impl RetryExt for reqwest::RequestBuilder { true => match r.text().await { Ok(message) if !message.is_empty() => message, Ok(_) => "No Body".to_string(), - Err(e) => format!("error getting response body: {}", e) + Err(e) => format!("error getting response body: {e}") } false => status.to_string(), }; diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 5646856..c12b37c 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -352,8 +352,7 @@ async fn make_metadata_request( audience: &str, ) -> Result { let url = format!( - "http://{}/computeMetadata/v1/instance/service-accounts/default/token", - hostname + "http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token" ); let response: TokenResponse = client .request(Method::GET, url) diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 871413b..97f4444 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -1271,8 +1271,7 @@ mod test { assert!( matches!(err, ObjectStoreError::NotFound { .. }), - "unexpected error type: {}", - err + "unexpected error type: {err}" ); } @@ -1291,8 +1290,7 @@ mod test { assert!( matches!(err, ObjectStoreError::NotFound { .. }), - "unexpected error type: {}", - err + "unexpected error type: {err}" ); } @@ -1305,8 +1303,7 @@ mod test { let err = integration.delete(&location).await.unwrap_err(); assert!( matches!(err, ObjectStoreError::NotFound { .. }), - "unexpected error type: {}", - err + "unexpected error type: {err}" ); } @@ -1322,8 +1319,7 @@ mod test { let err = integration.delete(&location).await.unwrap_err(); assert!( matches!(err, ObjectStoreError::NotFound { .. 
}), - "unexpected error type: {}", - err + "unexpected error type: {err}" ); } @@ -1352,7 +1348,7 @@ mod test { #[tokio::test] async fn gcs_test_proxy_url() { let mut tfile = NamedTempFile::new().unwrap(); - write!(tfile, "{}", FAKE_KEY).unwrap(); + write!(tfile, "{FAKE_KEY}").unwrap(); let service_account_path = tfile.path(); let gcs = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) @@ -1400,7 +1396,7 @@ mod test { #[test] fn gcs_test_service_account_key_and_path() { let mut tfile = NamedTempFile::new().unwrap(); - write!(tfile, "{}", FAKE_KEY).unwrap(); + write!(tfile, "{FAKE_KEY}").unwrap(); let _ = GoogleCloudStorageBuilder::new() .with_service_account_key(FAKE_KEY) .with_service_account_path(tfile.path().to_str().unwrap()) diff --git a/src/lib.rs b/src/lib.rs index 4ec58c3..8c20288 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -441,11 +441,9 @@ impl GetResult { } })?; - file.seek(SeekFrom::Start(0)).map_err(|source| { - local::Error::Seek { - source, - path: path.clone(), - } + file.rewind().map_err(|source| local::Error::Seek { + source, + path: path.clone(), })?; let mut buffer = Vec::with_capacity(len as usize); @@ -611,8 +609,7 @@ mod tests { let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!( content_list.is_empty(), - "Expected list to be empty; found: {:?}", - content_list + "Expected list to be empty; found: {content_list:?}" ); let location = Path::from("test_dir/test_file.json"); @@ -815,7 +812,7 @@ mod tests { storage.delete(&path).await.unwrap(); let files = flatten_list_stream(storage, None).await.unwrap(); - assert!(files.is_empty(), "{:?}", files); + assert!(files.is_empty(), "{files:?}"); } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { @@ -900,8 +897,7 @@ mod tests { let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!( content_list.is_empty(), - "Expected list to be empty; found: {:?}", - content_list + "Expected list to be empty; found: {content_list:?}" ); let location1 = Path::from("foo/x.json"); diff --git a/src/local.rs b/src/local.rs index 2ef87ad..9a518ba 100644 --- a/src/local.rs +++ b/src/local.rs @@ -555,7 +555,7 @@ impl ObjectStore for LocalFileSystem { fn get_upload_stage_path(dest: &std::path::Path, multipart_id: &MultipartId) -> PathBuf { let mut staging_path = dest.as_os_str().to_owned(); - staging_path.push(format!("#{}", multipart_id)); + staging_path.push(format!("#{multipart_id}")); staging_path.into() } @@ -607,7 +607,7 @@ impl AsyncWrite for LocalUpload { |condition: &str| -> std::task::Poll> { Poll::Ready(Err(io::Error::new( io::ErrorKind::InvalidInput, - format!("Tried to write to file {}.", condition), + format!("Tried to write to file {condition}."), ))) }; @@ -1040,12 +1040,11 @@ mod tests { let source_variant = source.downcast_ref::(); assert!( matches!(source_variant, Some(std::io::Error { .. }),), - "got: {:?}", - source_variant + "got: {source_variant:?}" ); assert!(path.ends_with(NON_EXISTENT_NAME), "{}", path); } else { - panic!("unexpected error type: {:?}", err); + panic!("unexpected error type: {err:?}"); } } diff --git a/src/memory.rs b/src/memory.rs index e4be5b2..372164c 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -365,12 +365,11 @@ mod tests { let source_variant = source.downcast_ref::(); assert!( matches!(source_variant, Some(Error::NoDataInMemory { .. 
}),), - "got: {:?}", - source_variant + "got: {source_variant:?}" ); assert_eq!(path, NON_EXISTENT_NAME); } else { - panic!("unexpected error type: {:?}", err); + panic!("unexpected error type: {err:?}"); } } } diff --git a/src/multipart.rs b/src/multipart.rs index 65427d1..0606fb5 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -222,7 +222,7 @@ where part.ok_or_else(|| { io::Error::new( io::ErrorKind::Other, - format!("Missing information for upload part {}", idx), + format!("Missing information for upload part {idx}"), ) }) }) diff --git a/src/path/mod.rs b/src/path/mod.rs index 020e5f5..4b0862e 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -454,63 +454,49 @@ mod tests { // self starts with self assert!( haystack.prefix_matches(&haystack), - "{:?} should have started with {:?}", - haystack, - haystack + "{haystack:?} should have started with {haystack:?}" ); // a longer prefix doesn't match let needle = needle.child("longer now"); assert!( !haystack.prefix_matches(&needle), - "{:?} shouldn't have started with {:?}", - haystack, - needle + "{haystack:?} shouldn't have started with {needle:?}" ); // one dir prefix matches let needle = Path::from_iter(["foo/bar"]); assert!( haystack.prefix_matches(&needle), - "{:?} should have started with {:?}", - haystack, - needle + "{haystack:?} should have started with {needle:?}" ); // two dir prefix matches let needle = needle.child("baz%2Ftest"); assert!( haystack.prefix_matches(&needle), - "{:?} should have started with {:?}", - haystack, - needle + "{haystack:?} should have started with {needle:?}" ); // partial dir prefix doesn't match let needle = Path::from_iter(["f"]); assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // one dir and one partial dir doesn't match let needle = Path::from_iter(["foo/bar", "baz"]); assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // empty prefix matches let needle = Path::from(""); assert!( haystack.prefix_matches(&needle), - "{:?} should have started with {:?}", - haystack, - needle + "{haystack:?} should have started with {needle:?}" ); } @@ -524,9 +510,7 @@ mod tests { assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // All directories match but file name is not a prefix @@ -534,9 +518,7 @@ mod tests { assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // Not all directories match; file name is a prefix of the next directory; this @@ -545,9 +527,7 @@ mod tests { assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // Not all directories match; file name is NOT a prefix of the next directory; @@ -556,9 +536,7 @@ mod tests { assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); } From d2d6aa9c56a1b0a759990ef6b06efca2b75f85d4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 27 Jan 2023 18:06:58 +0000 Subject: [PATCH 090/397] Update AWS SDK 
(#3617) --- Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8c9ede0..c685685 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,9 +53,9 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } # AWS Profile support -aws-types = { version = "0.53", optional = true } -aws-credential-types = { version = "0.53", optional = true } -aws-config = { version = "0.53", optional = true } +aws-types = { version = "0.54", optional = true } +aws-credential-types = { version = "0.54", optional = true } +aws-config = { version = "0.54", optional = true } [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] From 99bfb5ab5e564e107a0e252c22bd090d7b97fe3d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 1 Feb 2023 20:06:43 +0000 Subject: [PATCH 091/397] Prepare object store 0.5.4 (#3636) (#3640) * Prepare object store 0.5.4 (#3636) * Update CHANGELOG-old --- CHANGELOG-old.md | 35 ++++++++++++++++++++++++++++ CHANGELOG.md | 40 +++++++++++++------------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 ++-- 4 files changed, 54 insertions(+), 27 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 2813cfc..78237a0 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -21,6 +21,41 @@ # Changelog +## [object_store_0.5.3](https://github.com/apache/arrow-rs/tree/object_store_0.5.3) (2023-01-04) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.2...object_store_0.5.3) + +**Implemented enhancements:** + +- Derive Clone for the builders in object-store. 
[\#3419](https://github.com/apache/arrow-rs/issues/3419) +- Add a constant prefix object store wrapper [\#3328](https://github.com/apache/arrow-rs/issues/3328) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add support for content-type while uploading files through ObjectStore API [\#3300](https://github.com/apache/arrow-rs/issues/3300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add HttpStore [\#3294](https://github.com/apache/arrow-rs/issues/3294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) +- object\_store: Add Put and Multipart Upload Doc Examples [\#2863](https://github.com/apache/arrow-rs/issues/2863) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Closed issues:** + +- Only flush buffered multi-part data on poll\_shutdown not on poll\_flush [\#3390](https://github.com/apache/arrow-rs/issues/3390) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- object\_store: builder configuration api [\#3436](https://github.com/apache/arrow-rs/pull/3436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Derive Clone for ObjectStore builders and Make URL Parsing Stricter \(\#3419\) [\#3424](https://github.com/apache/arrow-rs/pull/3424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add Put and Multipart Put doc examples [\#3420](https://github.com/apache/arrow-rs/pull/3420) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([GeauxEric](https://github.com/GeauxEric)) +- object\_store: update localstack instructions [\#3403](https://github.com/apache/arrow-rs/pull/3403) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- object\_store: Flush buffered multipart only during poll\_shutdown [\#3397](https://github.com/apache/arrow-rs/pull/3397) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) +- Update quick-xml to 0.27 [\#3395](https://github.com/apache/arrow-rs/pull/3395) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add HttpStore \(\#3294\) [\#3380](https://github.com/apache/arrow-rs/pull/3380) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- add support for content-type in `ClientOptions` [\#3358](https://github.com/apache/arrow-rs/pull/3358) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- Update AWS SDK [\#3349](https://github.com/apache/arrow-rs/pull/3349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Upstream newline\_delimited\_stream and ChunkedStore from DataFusion [\#3341](https://github.com/apache/arrow-rs/pull/3341) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object\_store\): add PrefixObjectStore [\#3329](https://github.com/apache/arrow-rs/pull/3329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([roeap](https://github.com/roeap)) +- feat\(object\_store\): parse well-known storage urls [\#3327](https://github.com/apache/arrow-rs/pull/3327) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Disable getrandom object\_store [\#3278](https://github.com/apache/arrow-rs/pull/3278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Reload token from AWS\_WEB\_IDENTITY\_TOKEN\_FILE [\#3274](https://github.com/apache/arrow-rs/pull/3274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Minor: skip aws integration test if TEST\_INTEGRATION is not set [\#3262](https://github.com/apache/arrow-rs/pull/3262) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) + ## [object_store_0.5.2](https://github.com/apache/arrow-rs/tree/object_store_0.5.2) (2022-12-02) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.1...object_store_0.5.2) diff --git a/CHANGELOG.md b/CHANGELOG.md index 41b029c..c1734ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,40 +19,32 @@ # Changelog -## [object_store_0.5.3](https://github.com/apache/arrow-rs/tree/object_store_0.5.3) (2023-01-04) +## [object_store_0.5.4](https://github.com/apache/arrow-rs/tree/object_store_0.5.4) (2023-01-30) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.2...object_store_0.5.3) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.3...object_store_0.5.4) **Implemented enhancements:** -- Derive Clone for the builders in object-store. [\#3419](https://github.com/apache/arrow-rs/issues/3419) -- Add a constant prefix object store wrapper [\#3328](https://github.com/apache/arrow-rs/issues/3328) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add support for content-type while uploading files through ObjectStore API [\#3300](https://github.com/apache/arrow-rs/issues/3300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add HttpStore [\#3294](https://github.com/apache/arrow-rs/issues/3294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) -- object\_store: Add Put and Multipart Upload Doc Examples [\#2863](https://github.com/apache/arrow-rs/issues/2863) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] support more identity based auth flows for azure [\#3580](https://github.com/apache/arrow-rs/issues/3580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Implement workload identity and application default credentials for GCP object store. 
[\#3533](https://github.com/apache/arrow-rs/issues/3533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support GCP Workload Identity [\#3490](https://github.com/apache/arrow-rs/issues/3490) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Allow providing service account key directly when building GCP object store client [\#3488](https://github.com/apache/arrow-rs/issues/3488) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Closed issues:** -- Only flush buffered multi-part data on poll\_shutdown not on poll\_flush [\#3390](https://github.com/apache/arrow-rs/issues/3390) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: temporary aws credentials not refreshed? [\#3446](https://github.com/apache/arrow-rs/issues/3446) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- object\_store: builder configuration api [\#3436](https://github.com/apache/arrow-rs/pull/3436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Derive Clone for ObjectStore builders and Make URL Parsing Stricter \(\#3419\) [\#3424](https://github.com/apache/arrow-rs/pull/3424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add Put and Multipart Put doc examples [\#3420](https://github.com/apache/arrow-rs/pull/3420) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([GeauxEric](https://github.com/GeauxEric)) -- object\_store: update localstack instructions [\#3403](https://github.com/apache/arrow-rs/pull/3403) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- object\_store: Flush buffered multipart only during poll\_shutdown [\#3397](https://github.com/apache/arrow-rs/pull/3397) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) -- Update quick-xml to 0.27 [\#3395](https://github.com/apache/arrow-rs/pull/3395) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add HttpStore \(\#3294\) [\#3380](https://github.com/apache/arrow-rs/pull/3380) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- add support for content-type in `ClientOptions` [\#3358](https://github.com/apache/arrow-rs/pull/3358) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) -- Update AWS SDK [\#3349](https://github.com/apache/arrow-rs/pull/3349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Upstream newline\_delimited\_stream and ChunkedStore from DataFusion [\#3341](https://github.com/apache/arrow-rs/pull/3341) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat\(object\_store\): add PrefixObjectStore [\#3329](https://github.com/apache/arrow-rs/pull/3329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- feat\(object\_store\): parse well-known storage urls [\#3327](https://github.com/apache/arrow-rs/pull/3327) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([roeap](https://github.com/roeap)) -- Disable getrandom object\_store [\#3278](https://github.com/apache/arrow-rs/pull/3278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Reload token from AWS\_WEB\_IDENTITY\_TOKEN\_FILE [\#3274](https://github.com/apache/arrow-rs/pull/3274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Minor: skip aws integration test if TEST\_INTEGRATION is not set [\#3262](https://github.com/apache/arrow-rs/pull/3262) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Final tweaks to 32.0.0 changelog [\#3618](https://github.com/apache/arrow-rs/pull/3618) ([tustvold](https://github.com/tustvold)) +- Update AWS SDK [\#3617](https://github.com/apache/arrow-rs/pull/3617) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ClientOption.allow\_insecure [\#3600](https://github.com/apache/arrow-rs/pull/3600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([poelzi](https://github.com/poelzi)) +- \[object\_store\] support azure managed and workload identities [\#3581](https://github.com/apache/arrow-rs/pull/3581) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Additional GCP authentication [\#3541](https://github.com/apache/arrow-rs/pull/3541) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([winding-lines](https://github.com/winding-lines)) +- Update aws-config and aws-types requirements from 0.52 to 0.53 [\#3539](https://github.com/apache/arrow-rs/pull/3539) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) +- Remove azurite test exception [\#3497](https://github.com/apache/arrow-rs/pull/3497) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat: Allow providing a service account key directly for GCS [\#3489](https://github.com/apache/arrow-rs/pull/3489) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) diff --git a/Cargo.toml b/Cargo.toml index c685685..686a661 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.3" +version = "0.5.4" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 2f6c809..5cf5582 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.2" -FUTURE_RELEASE="object_store_0.5.3" +SINCE_TAG="object_store_0.5.3" +FUTURE_RELEASE="object_store_0.5.4" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 3196e9d032ed146955002411bb7b00d71e1c3480 Mon Sep 17 00:00:00 2001 From: Rich Date: Mon, 6 Feb 2023 06:14:27 -0500 Subject: [PATCH 092/397] object_store: add Path::from_url_path (#3663) * object_store: add Path::from_url_path * reuse existing implementation * Final tweaks * Fix wasm32 build --------- Co-authored-by: Raphael Taylor-Davies --- src/path/mod.rs 
| 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/path/mod.rs b/src/path/mod.rs index 4b0862e..a15f7ca 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -18,7 +18,6 @@ //! Path abstraction for Object Storage use itertools::Itertools; -#[cfg(not(target_arch = "wasm32"))] use percent_encoding::percent_decode; use snafu::{ensure, ResultExt, Snafu}; use std::fmt::Formatter; @@ -166,7 +165,7 @@ impl Path { /// Convert a filesystem path to a [`Path`] relative to the filesystem root /// /// This will return an error if the path contains illegal character sequences - /// as defined by [`Path::parse`] or does not exist + /// as defined on the docstring for [`Path`] or does not exist /// /// Note: this will canonicalize the provided path, resolving any symlinks pub fn from_filesystem_path( @@ -182,8 +181,8 @@ impl Path { #[cfg(not(target_arch = "wasm32"))] /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root /// - /// This will return an error if the path contains illegal character sequences - /// as defined by [`Path::parse`], or `base` is not an absolute path + /// This will return an error if the path contains illegal character sequences, + /// as defined on the docstring for [`Path`], or `base` is not an absolute path pub fn from_absolute_path(path: impl AsRef) -> Result { Self::from_absolute_path_with_base(path, None) } @@ -191,9 +190,9 @@ impl Path { #[cfg(not(target_arch = "wasm32"))] /// Convert a filesystem path to a [`Path`] relative to the provided base /// - /// This will return an error if the path contains illegal character sequences - /// as defined by [`Path::parse`], or `base` does not refer to a parent path of `path`, - /// or `base` is not an absolute path + /// This will return an error if the path contains illegal character sequences, + /// as defined on the docstring for [`Path`], or `base` does not refer to a parent + /// path of `path`, or `base` is not an absolute path pub(crate) fn from_absolute_path_with_base( path: impl AsRef, base: Option<&Url>, @@ -210,6 +209,15 @@ impl Path { }; // Reverse any percent encoding performed by conversion to URL + Self::from_url_path(path) + } + + /// Parse a url encoded string as a [`Path`], returning a [`Error`] if invalid + /// + /// This will return an error if the path contains illegal character sequences + /// as defined on the docstring for [`Path`] + pub fn from_url_path(path: impl AsRef) -> Result { + let path = path.as_ref(); let decoded = percent_decode(path.as_bytes()) .decode_utf8() .context(NonUnicodeSnafu { path })?; @@ -551,6 +559,23 @@ mod tests { assert_eq!(b.raw, c.raw); } + #[test] + fn from_url_path() { + let a = Path::from_url_path("foo%20bar").unwrap(); + let b = Path::from_url_path("foo/%2E%2E/bar").unwrap_err(); + let c = Path::from_url_path("foo%2F%252E%252E%2Fbar").unwrap(); + let d = Path::from_url_path("foo/%252E%252E/bar").unwrap(); + let e = Path::from_url_path("%48%45%4C%4C%4F").unwrap(); + let f = Path::from_url_path("foo/%FF/as").unwrap_err(); + + assert_eq!(a.raw, "foo bar"); + assert!(matches!(b, Error::BadSegment { .. })); + assert_eq!(c.raw, "foo/%2E%2E/bar"); + assert_eq!(d.raw, "foo/%2E%2E/bar"); + assert_eq!(e.raw, "HELLO"); + assert!(matches!(f, Error::NonUnicode { .. 
})); + } + #[test] fn filename_from_path() { let a = Path::from("foo/bar"); From c55714ce8d3c1ad8a240cee2ebfc58a6a488534a Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Mon, 13 Feb 2023 15:40:16 +0100 Subject: [PATCH 093/397] object_store: azure cli authorization (#3698) * fix: pass bearer token credential as auth header * feat: add azure cli credential * fix: clippy * Update object_store/src/azure/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * chore: PR feedback * docs: add azure cli link --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/azure/client.rs | 14 ++++- src/azure/credential.rs | 126 +++++++++++++++++++++++++++++++++++++++- src/azure/mod.rs | 27 ++++++++- 3 files changed, 164 insertions(+), 3 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index 39da717..76bb451 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -169,6 +169,18 @@ impl AzureClient { CredentialProvider::AccessKey(key) => { Ok(AzureCredential::AccessKey(key.to_owned())) } + CredentialProvider::BearerToken(token) => { + Ok(AzureCredential::AuthorizationToken( + // we do the conversion to a HeaderValue here, since it is fallible + // and we want to use it in an infallible function + HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { + crate::Error::Generic { + store: "MicrosoftAzure", + source: Box::new(err), + } + })?, + )) + } CredentialProvider::TokenCredential(cache, cred) => { let token = cache .get_or_insert_with(|| { @@ -178,7 +190,7 @@ impl AzureClient { .context(AuthorizationSnafu)?; Ok(AzureCredential::AuthorizationToken( // we do the conversion to a HeaderValue here, since it is fallible - // and we wna to use it in an infallible function + // and we want to use it in an infallible function HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { crate::Error::Generic { store: "MicrosoftAzure", diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 67023d2..9460c2d 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -21,7 +21,7 @@ use crate::util::hmac_sha256; use crate::RetryConfig; use base64::prelude::BASE64_STANDARD; use base64::Engine; -use chrono::Utc; +use chrono::{DateTime, Utc}; use reqwest::header::ACCEPT; use reqwest::{ header::{ @@ -34,6 +34,7 @@ use reqwest::{ use serde::Deserialize; use snafu::{ResultExt, Snafu}; use std::borrow::Cow; +use std::process::Command; use std::str; use std::time::{Duration, Instant}; use url::Url; @@ -61,6 +62,12 @@ pub enum Error { #[snafu(display("Error reading federated token file "))] FederatedTokenFile, + + #[snafu(display("'az account get-access-token' command failed: {message}"))] + AzureCli { message: String }, + + #[snafu(display("Failed to parse azure cli response: {source}"))] + AzureCliResponse { source: serde_json::Error }, } pub type Result = std::result::Result; @@ -69,6 +76,7 @@ pub type Result = std::result::Result; #[derive(Debug)] pub enum CredentialProvider { AccessKey(String), + BearerToken(String), SASToken(Vec<(String, String)>), TokenCredential(TokenCache, Box), } @@ -540,6 +548,122 @@ impl TokenCredential for WorkloadIdentityOAuthProvider { } } +mod az_cli_date_format { + use chrono::{DateTime, TimeZone}; + use serde::{self, Deserialize, Deserializer}; + + pub fn deserialize<'de, D>( + deserializer: D, + ) -> Result, D::Error> + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + // 
expiresOn from azure cli uses the local timezone + let date = chrono::NaiveDateTime::parse_from_str(&s, "%Y-%m-%d %H:%M:%S.%6f") + .map_err(serde::de::Error::custom)?; + chrono::Local + .from_local_datetime(&date) + .single() + .ok_or(serde::de::Error::custom( + "azure cli returned ambiguous expiry date", + )) + } +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct AzureCliTokenResponse { + pub access_token: String, + #[serde(with = "az_cli_date_format")] + pub expires_on: DateTime, + pub token_type: String, +} + +#[derive(Default, Debug)] +pub struct AzureCliCredential { + _private: (), +} + +impl AzureCliCredential { + pub fn new() -> Self { + Self::default() + } +} + +#[async_trait::async_trait] +impl TokenCredential for AzureCliCredential { + /// Fetch a token + async fn fetch_token( + &self, + _client: &Client, + _retry: &RetryConfig, + ) -> Result> { + // on window az is a cmd and it should be called like this + // see https://doc.rust-lang.org/nightly/std/process/struct.Command.html + let program = if cfg!(target_os = "windows") { + "cmd" + } else { + "az" + }; + let mut args = Vec::new(); + if cfg!(target_os = "windows") { + args.push("/C"); + args.push("az"); + } + args.push("account"); + args.push("get-access-token"); + args.push("--output"); + args.push("json"); + args.push("--scope"); + args.push(AZURE_STORAGE_SCOPE); + + match Command::new(program).args(args).output() { + Ok(az_output) if az_output.status.success() => { + let output = + str::from_utf8(&az_output.stdout).map_err(|_| Error::AzureCli { + message: "az response is not a valid utf-8 string".to_string(), + })?; + + let token_response = + serde_json::from_str::(output) + .context(AzureCliResponseSnafu)?; + if !token_response.token_type.eq_ignore_ascii_case("bearer") { + return Err(Error::AzureCli { + message: format!( + "got unexpected token type from azure cli: {0}", + token_response.token_type + ), + }); + } + let duration = token_response.expires_on.naive_local() + - chrono::Local::now().naive_local(); + Ok(TemporaryToken { + token: token_response.access_token, + expiry: Instant::now() + + duration.to_std().map_err(|_| Error::AzureCli { + message: "az returned invalid lifetime".to_string(), + })?, + }) + } + Ok(az_output) => { + let message = String::from_utf8_lossy(&az_output.stderr); + Err(Error::AzureCli { + message: message.into(), + }) + } + Err(e) => match e.kind() { + std::io::ErrorKind::NotFound => Err(Error::AzureCli { + message: "Azure Cli not installed".into(), + }), + error_kind => Err(Error::AzureCli { + message: format!("io error: {error_kind:?}"), + }), + }, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 5296906..e5f1465 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -400,6 +400,7 @@ pub struct MicrosoftAzureBuilder { object_id: Option, msi_resource_id: Option, federated_token_file: Option, + use_azure_cli: bool, retry_config: RetryConfig, client_options: ClientOptions, } @@ -533,6 +534,13 @@ pub enum AzureConfigKey { /// - `azure_federated_token_file` /// - `federated_token_file` FederatedTokenFile, + + /// Use azure cli for acquiring access token + /// + /// Supported keys: + /// - `azure_use_azure_cli` + /// - `use_azure_cli` + UseAzureCli, } impl AsRef for AzureConfigKey { @@ -550,6 +558,7 @@ impl AsRef for AzureConfigKey { Self::ObjectId => "azure_object_id", Self::MsiResourceId => "azure_msi_resource_id", Self::FederatedTokenFile => "azure_federated_token_file", + Self::UseAzureCli => 
"azure_use_azure_cli", } } } @@ -593,6 +602,7 @@ impl FromStr for AzureConfigKey { "azure_federated_token_file" | "federated_token_file" => { Ok(Self::FederatedTokenFile) } + "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -704,6 +714,9 @@ impl MicrosoftAzureBuilder { AzureConfigKey::FederatedTokenFile => { self.federated_token_file = Some(value.into()) } + AzureConfigKey::UseAzureCli => { + self.use_azure_cli = str_is_truthy(&value.into()) + } AzureConfigKey::UseEmulator => { self.use_emulator = str_is_truthy(&value.into()) } @@ -887,6 +900,13 @@ impl MicrosoftAzureBuilder { self } + /// Set if the Azure Cli should be used for acquiring access token + /// + pub fn with_use_azure_cli(mut self, use_azure_cli: bool) -> Self { + self.use_azure_cli = use_azure_cli; + self + } + /// Configure a connection to container with given name on Microsoft Azure /// Blob store. pub fn build(mut self) -> Result { @@ -916,7 +936,7 @@ impl MicrosoftAzureBuilder { let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; let credential = if let Some(bearer_token) = self.bearer_token { - credential::CredentialProvider::AccessKey(bearer_token) + credential::CredentialProvider::BearerToken(bearer_token) } else if let Some(access_key) = self.access_key { credential::CredentialProvider::AccessKey(access_key) } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = @@ -949,6 +969,11 @@ impl MicrosoftAzureBuilder { credential::CredentialProvider::SASToken(query_pairs) } else if let Some(sas) = self.sas_key { credential::CredentialProvider::SASToken(split_sas(&sas)?) + } else if self.use_azure_cli { + credential::CredentialProvider::TokenCredential( + TokenCache::default(), + Box::new(credential::AzureCliCredential::new()), + ) } else { let client = self.client_options.clone().with_allow_http(true).client()?; From 601f7f937ecb9cd9fa1f90c7d79f740f8ed8b0e8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 13 Feb 2023 22:29:51 +0000 Subject: [PATCH 094/397] Filter exact list prefix matches for MemoryStore and HttpStore (#3712) (#3713) * Filter exact list prefix matches for MemoryStore and HttpStore (#3712) * Update object_store/src/lib.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- src/http/mod.rs | 14 ++++++++++++-- src/lib.rs | 20 ++++++++++++++++++++ src/memory.rs | 21 +++++++++++++++++---- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/src/http/mod.rs b/src/http/mod.rs index f05e700..c91faa2 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -37,6 +37,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; +use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; @@ -163,6 +164,7 @@ impl ObjectStore for HttpStore { &self, prefix: Option<&Path>, ) -> Result>> { + let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or_default(); let status = self.client.list(prefix, "infinity").await?; Ok(futures::stream::iter( status @@ -172,7 +174,9 @@ impl ObjectStore for HttpStore { .map(|response| { response.check_ok()?; response.object_meta(self.client.base_url()) - }), + }) + // Filter out exact prefix matches + .filter_ok(move |r| r.location.as_ref().len() > prefix_len), ) .boxed()) } @@ -186,7 +190,13 @@ impl ObjectStore for HttpStore { for response in 
status.response { response.check_ok()?; match response.is_dir() { - false => objects.push(response.object_meta(self.client.base_url())?), + false => { + let meta = response.object_meta(self.client.base_url())?; + // Filter out exact prefix matches + if meta.location.as_ref().len() > prefix_len { + objects.push(meta); + } + } true => { let path = response.path(self.client.base_url())?; // Exclude the current object diff --git a/src/lib.rs b/src/lib.rs index 8c20288..6a3275b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -911,9 +911,29 @@ mod tests { let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[location1.clone()]); + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert_eq!(result.objects.len(), 1); + assert_eq!(result.objects[0].location, location1); + assert_eq!(result.common_prefixes, &[]); + + // Listing an existing path (file) should return an empty list: + // https://github.com/apache/arrow-rs/issues/3712 + let content_list = flatten_list_stream(storage, Some(&location1)) + .await + .unwrap(); + assert_eq!(content_list, &[]); + + let list = storage.list_with_delimiter(Some(&location1)).await.unwrap(); + assert_eq!(list.objects, &[]); + assert_eq!(list.common_prefixes, &[]); + let prefix = Path::from("foo/x"); let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[]); + + let list = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert_eq!(list.objects, &[]); + assert_eq!(list.common_prefixes, &[]); } pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) { diff --git a/src/memory.rs b/src/memory.rs index 372164c..40eee55 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -163,13 +163,21 @@ impl ObjectStore for InMemory { &self, prefix: Option<&Path>, ) -> Result>> { + let root = Path::default(); + let prefix = prefix.unwrap_or(&root); let last_modified = Utc::now(); let storage = self.storage.read(); let values: Vec<_> = storage - .iter() - .filter(move |(key, _)| prefix.map(|p| key.prefix_matches(p)).unwrap_or(true)) - .map(move |(key, value)| { + .range((prefix)..) + .take_while(|(key, _)| key.as_ref().starts_with(prefix.as_ref())) + .filter(|(key, _)| { + // Don't return for exact prefix match + key.prefix_match(prefix) + .map(|mut x| x.next().is_some()) + .unwrap_or(false) + }) + .map(|(key, value)| { Ok(ObjectMeta { location: key.clone(), last_modified, @@ -195,14 +203,19 @@ impl ObjectStore for InMemory { // response. Otherwise, we just collect the common prefixes. let mut objects = vec![]; for (k, v) in self.storage.read().range((prefix)..) 
{ + if !k.as_ref().starts_with(prefix.as_ref()) { + break; + } + let mut parts = match k.prefix_match(prefix) { Some(parts) => parts, - None => break, + None => continue, }; // Pop first element let common_prefix = match parts.next() { Some(p) => p, + // Should only return children of the prefix None => continue, }; From 91c3778c183d25eac275b523eac6d01c10b5493c Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Tue, 14 Feb 2023 13:03:15 +0100 Subject: [PATCH 095/397] Filter exact list prefix matches for azure gen2 accounts (#3714) * fix: consistent list responses for gen1 and gen2 accounts * Update object_store/src/azure/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/azure/client.rs | 58 ++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index 76bb451..c5a5652 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -388,7 +388,7 @@ impl AzureClient { .context(InvalidListResponseSnafu)?; let token = response.next_marker.take(); - Ok((response.try_into()?, token)) + Ok((to_list_result(response, prefix)?, token)) } /// Perform a list operation automatically handling pagination @@ -419,33 +419,37 @@ struct ListResultInternal { pub blobs: Blobs, } -impl TryFrom for ListResult { - type Error = crate::Error; - - fn try_from(value: ListResultInternal) -> Result { - let common_prefixes = value - .blobs - .blob_prefix - .into_iter() - .map(|x| Ok(Path::parse(x.name)?)) - .collect::>()?; - - let objects = value - .blobs - .blobs - .into_iter() - .map(ObjectMeta::try_from) - // Note: workaround for gen2 accounts with hierarchical namespaces. These accounts also - // return path segments as "directories". When we cant directories, its always via - // the BlobPrefix mechanics. - .filter_map_ok(|obj| if obj.size > 0 { Some(obj) } else { None }) - .collect::>()?; - - Ok(Self { - common_prefixes, - objects, +fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result { + let prefix = prefix.map(Path::from).unwrap_or_else(Path::default); + let common_prefixes = value + .blobs + .blob_prefix + .into_iter() + .map(|x| Ok(Path::parse(x.name)?)) + .collect::>()?; + + let objects = value + .blobs + .blobs + .into_iter() + .map(ObjectMeta::try_from) + // Note: workaround for gen2 accounts with hierarchical namespaces. These accounts also + // return path segments as "directories" and include blobs in list requests with prefix, + // if the prefix mateches the blob. When we want directories, its always via + // the BlobPrefix mechanics, and during lists we state that prefixes are evaluated on path segement basis. + .filter_map_ok(|obj| { + if obj.size > 0 && obj.location.as_ref().len() > prefix.as_ref().len() { + Some(obj) + } else { + None + } }) - } + .collect::>()?; + + Ok(ListResult { + common_prefixes, + objects, + }) } /// Collection of blobs and potentially shared prefixes returned from list requests. 
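
For reference, a minimal standalone sketch (not part of any patch above; it assumes object_store 0.5.x together with the `bytes`, `futures`, and `tokio` crates, and the paths are made up) of the listing behaviour that #3713 and #3714 establish: a prefix that exactly names an existing object no longer lists that object, while its parent prefix still does.

```rust
// Illustrative only: mirrors the list semantics introduced by #3712/#3713/#3714
// using the in-memory store.
use bytes::Bytes;
use futures::TryStreamExt;
use object_store::{memory::InMemory, path::Path, ObjectStore};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let store = InMemory::new();
    store
        .put(&Path::from("foo/bar"), Bytes::from_static(b"data"))
        .await?;

    // A prefix that exactly names an existing object yields no entries...
    let exact: Vec<_> = store
        .list(Some(&Path::from("foo/bar")))
        .await?
        .try_collect()
        .await?;
    assert!(exact.is_empty());

    // ...while the parent prefix still lists the object itself.
    let children: Vec<_> = store
        .list(Some(&Path::from("foo")))
        .await?
        .try_collect()
        .await?;
    assert_eq!(children.len(), 1);
    Ok(())
}
```
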
From e03fe20b67743a0d1ed78be1f4be61396ddcb84b Mon Sep 17 00:00:00 2001 From: Spencer Bartholomew <38776747+spencerbart@users.noreply.github.com> Date: Sat, 25 Feb 2023 02:01:49 -0700 Subject: [PATCH 096/397] update object_store deps to patch potential security vulnerabilities (#3761) --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 686a661..8ab0c15 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,7 +37,7 @@ itertools = "0.10.1" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" -tokio = { version = "1.18", features = ["sync", "macros", "rt", "time", "io-util"] } +tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } tracing = { version = "0.1" } url = "2.2" walkdir = "2" @@ -72,4 +72,4 @@ dotenv = "0.15.0" tempfile = "3.1.0" futures-test = "0.3" rand = "0.8" -hyper = { version = "0.14", features = ["server"] } +hyper = { version = "0.14.24", features = ["server"] } From 8c3acb546fdda35af2369fef156266fb18781718 Mon Sep 17 00:00:00 2001 From: Willem D'Haeseleer Date: Mon, 27 Feb 2023 09:21:25 -0800 Subject: [PATCH 097/397] object-store: fix handling of AWS profile credentials without expiry (#3766) * fix aws profile * fix unused import * support None as expiry * fix clippy * fix fmt * revert fmt whitespace fix --- src/aws/credential.rs | 18 ++++++------------ src/azure/credential.rs | 16 +++++++++------- src/client/token.rs | 22 ++++++++++++++-------- src/gcp/credential.rs | 6 +++--- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index cba5584..e2332d0 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -438,7 +438,7 @@ async fn instance_creds( let ttl = (creds.expiration - now).to_std().unwrap_or_default(); Ok(TemporaryToken { token: Arc::new(creds.into()), - expiry: Instant::now() + ttl, + expiry: Some(Instant::now() + ttl), }) } @@ -509,7 +509,7 @@ async fn web_identity( Ok(TemporaryToken { token: Arc::new(creds.into()), - expiry: Instant::now() + ttl, + expiry: Some(Instant::now() + ttl), }) } @@ -553,17 +553,11 @@ mod profile { store: "S3", source: Box::new(source), })?; - let t_now = SystemTime::now(); - let expiry = match c.expiry().and_then(|e| e.duration_since(t_now).ok()) { - Some(ttl) => Instant::now() + ttl, - None => { - return Err(crate::Error::Generic { - store: "S3", - source: "Invalid expiry".into(), - }) - } - }; + let expiry = c + .expiry() + .and_then(|e| e.duration_since(t_now).ok()) + .map(|ttl| Instant::now() + ttl); Ok(TemporaryToken { token: Arc::new(AwsCredential { diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 9460c2d..9e07222 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -360,7 +360,7 @@ impl TokenCredential for ClientSecretOAuthProvider { let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -467,7 +467,7 @@ impl TokenCredential for ImdsManagedIdentityOAuthProvider { let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -541,7 +541,7 @@ impl TokenCredential for WorkloadIdentityOAuthProvider { let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + 
Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -640,10 +640,12 @@ impl TokenCredential for AzureCliCredential { - chrono::Local::now().naive_local(); Ok(TemporaryToken { token: token_response.access_token, - expiry: Instant::now() - + duration.to_std().map_err(|_| Error::AzureCli { - message: "az returned invalid lifetime".to_string(), - })?, + expiry: Some( + Instant::now() + + duration.to_std().map_err(|_| Error::AzureCli { + message: "az returned invalid lifetime".to_string(), + })?, + ), }) } Ok(az_output) => { diff --git a/src/client/token.rs b/src/client/token.rs index 2ff2861..7e48d35 100644 --- a/src/client/token.rs +++ b/src/client/token.rs @@ -25,7 +25,8 @@ pub struct TemporaryToken { /// The temporary credential pub token: T, /// The instant at which this credential is no longer valid - pub expiry: Instant, + /// None means the credential does not expire + pub expiry: Option, } /// Provides [`TokenCache::get_or_insert_with`] which can be used to cache a @@ -53,13 +54,18 @@ impl TokenCache { let mut locked = self.cache.lock().await; if let Some(cached) = locked.as_ref() { - let delta = cached - .expiry - .checked_duration_since(now) - .unwrap_or_default(); - - if delta.as_secs() > 300 { - return Ok(cached.token.clone()); + match cached.expiry { + Some(ttl) + if ttl + .checked_duration_since(now) + .unwrap_or_default() + .as_secs() + > 300 => + { + return Ok(cached.token.clone()); + } + None => return Ok(cached.token.clone()), + _ => (), } } diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index c12b37c..853e4ce 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -220,7 +220,7 @@ impl TokenProvider for OAuthProvider { let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -393,7 +393,7 @@ impl TokenProvider for InstanceCredentialProvider { .await?; let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) } @@ -467,7 +467,7 @@ impl TokenProvider for ApplicationDefaultCredentials { .context(TokenResponseBodySnafu)?; let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) } From 7337426c700a1171c546aa8aa244dec7406de447 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 27 Feb 2023 20:18:56 +0000 Subject: [PATCH 098/397] Prepare object_store 0.5.5 (#3768) (#3770) --- CHANGELOG-old.md | 27 ++++++++++++++++++++++++++- CHANGELOG.md | 31 ++++++++++++++----------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 ++-- 4 files changed, 43 insertions(+), 21 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 78237a0..58fb8a3 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,7 +19,32 @@ # Historical Changelog -# Changelog +## [object_store_0.5.4](https://github.com/apache/arrow-rs/tree/object_store_0.5.4) (2023-01-30) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.3...object_store_0.5.4) + +**Implemented enhancements:** + +- \[object\_store\] support more identity based auth flows for 
azure [\#3580](https://github.com/apache/arrow-rs/issues/3580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Implement workload identity and application default credentials for GCP object store. [\#3533](https://github.com/apache/arrow-rs/issues/3533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support GCP Workload Identity [\#3490](https://github.com/apache/arrow-rs/issues/3490) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Allow providing service account key directly when building GCP object store client [\#3488](https://github.com/apache/arrow-rs/issues/3488) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Closed issues:** + +- object\_store: temporary aws credentials not refreshed? [\#3446](https://github.com/apache/arrow-rs/issues/3446) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Final tweaks to 32.0.0 changelog [\#3618](https://github.com/apache/arrow-rs/pull/3618) ([tustvold](https://github.com/tustvold)) +- Update AWS SDK [\#3617](https://github.com/apache/arrow-rs/pull/3617) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ClientOption.allow\_insecure [\#3600](https://github.com/apache/arrow-rs/pull/3600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([poelzi](https://github.com/poelzi)) +- \[object\_store\] support azure managed and workload identities [\#3581](https://github.com/apache/arrow-rs/pull/3581) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Additional GCP authentication [\#3541](https://github.com/apache/arrow-rs/pull/3541) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([winding-lines](https://github.com/winding-lines)) +- Update aws-config and aws-types requirements from 0.52 to 0.53 [\#3539](https://github.com/apache/arrow-rs/pull/3539) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) +- Remove azurite test exception [\#3497](https://github.com/apache/arrow-rs/pull/3497) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat: Allow providing a service account key directly for GCS [\#3489](https://github.com/apache/arrow-rs/pull/3489) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) ## [object_store_0.5.3](https://github.com/apache/arrow-rs/tree/object_store_0.5.3) (2023-01-04) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1734ec..b8f2fe8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,32 +19,29 @@ # Changelog -## [object_store_0.5.4](https://github.com/apache/arrow-rs/tree/object_store_0.5.4) (2023-01-30) +## [object_store_0.5.5](https://github.com/apache/arrow-rs/tree/object_store_0.5.5) (2023-02-27) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.3...object_store_0.5.4) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.4...object_store_0.5.5) **Implemented enhancements:** -- \[object\_store\] support more identity based auth flows for azure [\#3580](https://github.com/apache/arrow-rs/issues/3580) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Implement workload identity and application default credentials for GCP object store. [\#3533](https://github.com/apache/arrow-rs/issues/3533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support GCP Workload Identity [\#3490](https://github.com/apache/arrow-rs/issues/3490) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Allow providing service account key directly when building GCP object store client [\#3488](https://github.com/apache/arrow-rs/issues/3488) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: support azure cli credential [\#3697](https://github.com/apache/arrow-rs/issues/3697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: support encoded path as input [\#3651](https://github.com/apache/arrow-rs/issues/3651) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -**Closed issues:** +**Fixed bugs:** -- object\_store: temporary aws credentials not refreshed? [\#3446](https://github.com/apache/arrow-rs/issues/3446) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: aws\_profile fails to load static credentials [\#3765](https://github.com/apache/arrow-rs/issues/3765) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Inconsistent Behaviour Listing File [\#3712](https://github.com/apache/arrow-rs/issues/3712) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: bearer token is azure is used like access key [\#3696](https://github.com/apache/arrow-rs/issues/3696) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Final tweaks to 32.0.0 changelog [\#3618](https://github.com/apache/arrow-rs/pull/3618) ([tustvold](https://github.com/tustvold)) -- Update AWS SDK [\#3617](https://github.com/apache/arrow-rs/pull/3617) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add ClientOption.allow\_insecure [\#3600](https://github.com/apache/arrow-rs/pull/3600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([poelzi](https://github.com/poelzi)) -- \[object\_store\] support azure managed and workload identities [\#3581](https://github.com/apache/arrow-rs/pull/3581) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Additional GCP authentication [\#3541](https://github.com/apache/arrow-rs/pull/3541) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([winding-lines](https://github.com/winding-lines)) -- Update aws-config and aws-types requirements from 0.52 to 0.53 [\#3539](https://github.com/apache/arrow-rs/pull/3539) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) -- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) -- Remove azurite test exception [\#3497](https://github.com/apache/arrow-rs/pull/3497) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat: Allow providing a service account key directly for GCS [\#3489](https://github.com/apache/arrow-rs/pull/3489) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) +- object-store: fix handling of AWS profile credentials without expiry [\#3766](https://github.com/apache/arrow-rs/pull/3766) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([helmus](https://github.com/helmus)) +- update object\_store deps to patch potential security vulnerabilities [\#3761](https://github.com/apache/arrow-rs/pull/3761) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([spencerbart](https://github.com/spencerbart)) +- Filter exact list prefix matches for azure gen2 accounts [\#3714](https://github.com/apache/arrow-rs/pull/3714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Filter exact list prefix matches for MemoryStore and HttpStore \(\#3712\) [\#3713](https://github.com/apache/arrow-rs/pull/3713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: azure cli authorization [\#3698](https://github.com/apache/arrow-rs/pull/3698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- object\_store: add Path::from\_url\_path [\#3663](https://github.com/apache/arrow-rs/pull/3663) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jychen7](https://github.com/jychen7)) diff --git a/Cargo.toml b/Cargo.toml index 8ab0c15..c0c090c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.4" +version = "0.5.5" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 5cf5582..de80d0f 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.3" -FUTURE_RELEASE="object_store_0.5.4" +SINCE_TAG="object_store_0.5.4" +FUTURE_RELEASE="object_store_0.5.5" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 3ef6eeec578d16f8492f9bd239165e17f2eb3891 Mon Sep 17 00:00:00 2001 From: Satyam Singh Date: Tue, 28 Feb 2023 04:30:55 +0530 Subject: [PATCH 099/397] Add support for unsigned payloads in aws (#3741) * Add support for unsigned payloads in aws * Add unsigned payload to AmazonS3ConfigKey * Link to aws doc * Add env test * Add test * Add integration test * Take boolean argument * Fix doc * Clippy fixes * Merge into s3 test --- src/aws/client.rs | 50 +++++++++++++++++++++++++++++++------ src/aws/credential.rs | 57 ++++++++++++++++++++++++++++++++++++++++--- src/aws/mod.rs | 38 ++++++++++++++++++++++++++++- 3 files changed, 133 insertions(+), 12 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index b40bcba..0b0f883 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -204,6 +204,7 @@ pub struct S3Config { pub credentials: Box, pub retry_config: RetryConfig, pub client_options: ClientOptions, + pub sign_payload: bool, } impl S3Config { @@ -256,7 +257,12 @@ impl S3Client { } let response = builder - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(GetRequestSnafu { @@ -287,7 +293,12 @@ impl S3Client { let response = builder 
.query(query) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(PutRequestSnafu { @@ -309,7 +320,12 @@ impl S3Client { self.client .request(Method::DELETE, url) .query(query) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(DeleteRequestSnafu { @@ -328,7 +344,12 @@ impl S3Client { self.client .request(Method::PUT, url) .header("x-amz-copy-source", source) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(CopyRequestSnafu { @@ -369,7 +390,12 @@ impl S3Client { .client .request(Method::GET, &url) .query(&query) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(ListRequestSnafu)? @@ -407,7 +433,12 @@ impl S3Client { let response = self .client .request(Method::POST, url) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(CreateMultipartRequestSnafu)? @@ -446,7 +477,12 @@ impl S3Client { .request(Method::POST, url) .query(&[("uploadId", upload_id)]) .body(body) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(CompleteMultipartRequestSnafu)?; diff --git a/src/aws/credential.rs b/src/aws/credential.rs index e2332d0..05f2c53 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -39,6 +39,7 @@ type StdError = Box; /// SHA256 hash of empty string static EMPTY_SHA256_HASH: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; +static UNSIGNED_PAYLOAD_LITERAL: &str = "UNSIGNED-PAYLOAD"; #[derive(Debug)] pub struct AwsCredential { @@ -72,6 +73,7 @@ struct RequestSigner<'a> { credential: &'a AwsCredential, service: &'a str, region: &'a str, + sign_payload: bool, } const DATE_HEADER: &str = "x-amz-date"; @@ -98,9 +100,13 @@ impl<'a> RequestSigner<'a> { let date_val = HeaderValue::from_str(&date_str).unwrap(); request.headers_mut().insert(DATE_HEADER, date_val); - let digest = match request.body() { - None => EMPTY_SHA256_HASH.to_string(), - Some(body) => hex_digest(body.as_bytes().unwrap()), + let digest = if self.sign_payload { + match request.body() { + None => EMPTY_SHA256_HASH.to_string(), + Some(body) => hex_digest(body.as_bytes().unwrap()), + } + } else { + UNSIGNED_PAYLOAD_LITERAL.to_string() }; let header_digest = HeaderValue::from_str(&digest).unwrap(); @@ -158,6 +164,7 @@ pub trait CredentialExt { credential: &AwsCredential, region: &str, service: &str, + sign_payload: bool, ) -> Self; } @@ -167,6 +174,7 @@ impl CredentialExt for RequestBuilder { credential: &AwsCredential, region: &str, service: &str, + sign_payload: bool, ) -> Self { // Hack around lack of access to underlying request // 
https://github.com/seanmonstar/reqwest/issues/1212 @@ -182,6 +190,7 @@ impl CredentialExt for RequestBuilder { credential, service, region, + sign_payload, }; signer.sign(&mut request); @@ -585,7 +594,7 @@ mod tests { // Test generated using https://docs.aws.amazon.com/general/latest/gr/sigv4-signed-request-examples.html #[test] - fn test_sign() { + fn test_sign_with_signed_payload() { let client = Client::new(); // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html @@ -615,12 +624,51 @@ mod tests { credential: &credential, service: "ec2", region: "us-east-1", + sign_payload: true, }; signer.sign(&mut request); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } + #[test] + fn test_sign_with_unsigned_payload() { + let client = Client::new(); + + // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + // method = 'GET' + // service = 'ec2' + // host = 'ec2.amazonaws.com' + // region = 'us-east-1' + // endpoint = 'https://ec2.amazonaws.com' + // request_parameters = '' + let date = DateTime::parse_from_rfc3339("2022-08-06T18:01:34Z") + .unwrap() + .with_timezone(&Utc); + + let mut request = client + .request(Method::GET, "https://ec2.amazon.com/") + .build() + .unwrap(); + + let signer = RequestSigner { + date, + credential: &credential, + service: "ec2", + region: "us-east-1", + sign_payload: false, + }; + + signer.sign(&mut request); + assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699") + } + #[test] fn test_sign_port() { let client = Client::new(); @@ -651,6 +699,7 @@ mod tests { credential: &credential, service: "s3", region: "us-east-1", + sign_payload: true, }; signer.sign(&mut request); diff --git a/src/aws/mod.rs b/src/aws/mod.rs index a1c9eae..c724886 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -385,6 +385,7 @@ pub struct AmazonS3Builder { retry_config: RetryConfig, imdsv1_fallback: bool, virtual_hosted_style_request: bool, + unsigned_payload: bool, metadata_endpoint: Option, profile: Option, client_options: ClientOptions, @@ -504,6 +505,15 @@ pub enum AmazonS3ConfigKey { /// - `virtual_hosted_style_request` VirtualHostedStyleRequest, + /// Avoid computing payload checksum when calculating signature. + /// + /// See [`AmazonS3Builder::with_unsigned_payload`] for details. + /// + /// Supported keys: + /// - `aws_unsigned_payload` + /// - `unsigned_payload` + UnsignedPayload, + /// Set the instance metadata endpoint /// /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. 
@@ -535,6 +545,7 @@ impl AsRef for AmazonS3ConfigKey { Self::DefaultRegion => "aws_default_region", Self::MetadataEndpoint => "aws_metadata_endpoint", Self::Profile => "aws_profile", + Self::UnsignedPayload => "aws_unsigned_payload", } } } @@ -563,6 +574,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_profile" | "profile" => Ok(Self::Profile), "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), + "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -679,6 +691,9 @@ impl AmazonS3Builder { self.metadata_endpoint = Some(value.into()) } AmazonS3ConfigKey::Profile => self.profile = Some(value.into()), + AmazonS3ConfigKey::UnsignedPayload => { + self.unsigned_payload = str_is_truthy(&value.into()) + } }; Ok(self) } @@ -822,6 +837,15 @@ impl AmazonS3Builder { self } + /// Sets if unsigned payload option has to be used. + /// See [unsigned payload option](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-header-based-auth.html) + /// * false (default): Signed payload option is used, where the checksum for the request body is computed and included when constructing a canonical request. + /// * true: Unsigned payload option is used. `UNSIGNED-PAYLOAD` literal is included when constructing a canonical request, + pub fn with_unsigned_payload(mut self, unsigned_payload: bool) -> Self { + self.unsigned_payload = unsigned_payload; + self + } + /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), /// used primarily within AWS EC2. /// @@ -967,6 +991,7 @@ impl AmazonS3Builder { credentials, retry_config: self.retry_config, client_options: self.client_options, + sign_payload: !self.unsigned_payload, }; let client = Arc::new(S3Client::new(config)?); @@ -1125,6 +1150,7 @@ mod tests { "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI", &container_creds_relative_uri, ); + env::set_var("AWS_UNSIGNED_PAYLOAD", "true"); let builder = AmazonS3Builder::from_env(); assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); @@ -1136,9 +1162,9 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); - let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); + assert!(builder.unsigned_payload); } #[test] @@ -1154,6 +1180,7 @@ mod tests { ("aws_default_region", aws_default_region.clone()), ("aws_endpoint", aws_endpoint.clone()), ("aws_session_token", aws_session_token.clone()), + ("aws_unsigned_payload", "true".to_string()), ]); let builder = AmazonS3Builder::new() @@ -1166,6 +1193,7 @@ mod tests { assert_eq!(builder.region.unwrap(), aws_default_region); assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + assert!(builder.unsigned_payload); } #[test] @@ -1181,6 +1209,7 @@ mod tests { (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), (AmazonS3ConfigKey::Token, aws_session_token.clone()), + (AmazonS3ConfigKey::UnsignedPayload, "true".to_string()), ]); let builder = AmazonS3Builder::new() @@ -1193,6 +1222,7 @@ mod tests { assert_eq!(builder.region.unwrap(), aws_default_region); assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + 
assert!(builder.unsigned_payload); } #[test] @@ -1220,6 +1250,12 @@ mod tests { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; stream_get(&integration).await; + + // run integration test with unsigned payload enabled + let config = maybe_skip_integration!().with_unsigned_payload(true); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); + let integration = config.build().unwrap(); + put_get_delete_list_opts(&integration, is_local).await; } #[tokio::test] From d53d1c9b0aae01a0592dc65d5c723842832614c2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Mar 2023 16:57:52 +0000 Subject: [PATCH 100/397] Make LocalFileSystem::put atomic (#3780) (#3781) * Make LocalFileSystem::put atomic (#3780) * Clippy * Add list test --- src/lib.rs | 6 +++ src/local.rs | 106 ++++++++++++++++++++++++--------------------------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6a3275b..671b22d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -840,6 +840,12 @@ mod tests { crate::Error::NotFound { .. } )); + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(&files, &[]); + + let result = storage.list_with_delimiter(None).await.unwrap(); + assert_eq!(&result.objects, &[]); + writer.shutdown().await.unwrap(); let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); diff --git a/src/local.rs b/src/local.rs index 9a518ba..f1733f5 100644 --- a/src/local.rs +++ b/src/local.rs @@ -27,8 +27,8 @@ use futures::future::BoxFuture; use futures::FutureExt; use futures::{stream::BoxStream, StreamExt}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::fs::{metadata, symlink_metadata, File}; -use std::io::{Read, Seek, SeekFrom, Write}; +use std::fs::{metadata, symlink_metadata, File, OpenOptions}; +use std::io::{ErrorKind, Read, Seek, SeekFrom, Write}; use std::ops::Range; use std::pin::Pin; use std::sync::Arc; @@ -65,6 +65,11 @@ pub(crate) enum Error { source: io::Error, }, + #[snafu(display("Unable to rename file: {}", source))] + UnableToRenameFile { + source: io::Error, + }, + #[snafu(display("Unable to create dir {}: {}", path.display(), source))] UnableToCreateDir { source: io::Error, @@ -266,11 +271,14 @@ impl ObjectStore for LocalFileSystem { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { - let mut file = open_writable_file(&path)?; + let (mut file, suffix) = new_staged_upload(&path)?; + let staging_path = staged_upload_path(&path, &suffix); file.write_all(&bytes) .context(UnableToCopyDataToFileSnafu)?; + std::fs::rename(staging_path, path).context(UnableToRenameFileSnafu)?; + Ok(()) }) .await @@ -282,28 +290,10 @@ impl ObjectStore for LocalFileSystem { ) -> Result<(MultipartId, Box)> { let dest = self.config.path_to_filesystem(location)?; - // Generate an id in case of concurrent writes - let mut multipart_id = 1; - - // Will write to a temporary path - let staging_path = loop { - let staging_path = get_upload_stage_path(&dest, &multipart_id.to_string()); - - match std::fs::metadata(&staging_path) { - Err(err) if err.kind() == io::ErrorKind::NotFound => break staging_path, - Err(err) => { - return Err(Error::UnableToCopyDataToFile { source: err }.into()) - } - Ok(_) => multipart_id += 1, - } - }; - let multipart_id = multipart_id.to_string(); - - let file = open_writable_file(&staging_path)?; - + let (file, suffix) = 
new_staged_upload(&dest)?; Ok(( - multipart_id.clone(), - Box::new(LocalUpload::new(dest, multipart_id, Arc::new(file))), + suffix.clone(), + Box::new(LocalUpload::new(dest, suffix, Arc::new(file))), )) } @@ -313,7 +303,7 @@ impl ObjectStore for LocalFileSystem { multipart_id: &MultipartId, ) -> Result<()> { let dest = self.config.path_to_filesystem(location)?; - let staging_path: PathBuf = get_upload_stage_path(&dest, multipart_id); + let staging_path: PathBuf = staged_upload_path(&dest, multipart_id); maybe_spawn_blocking(move || { std::fs::remove_file(&staging_path) @@ -553,9 +543,40 @@ impl ObjectStore for LocalFileSystem { } } -fn get_upload_stage_path(dest: &std::path::Path, multipart_id: &MultipartId) -> PathBuf { +/// Generates a unique file path `{base}#{suffix}`, returning the opened `File` and `suffix` +/// +/// Creates any directories if necessary +fn new_staged_upload(base: &std::path::Path) -> Result<(File, String)> { + let mut multipart_id = 1; + loop { + let suffix = multipart_id.to_string(); + let path = staged_upload_path(base, &suffix); + let mut options = OpenOptions::new(); + match options.read(true).write(true).create_new(true).open(&path) { + Ok(f) => return Ok((f, suffix)), + Err(e) if e.kind() == ErrorKind::AlreadyExists => { + multipart_id += 1; + } + Err(err) if err.kind() == ErrorKind::NotFound => { + let parent = path + .parent() + .context(UnableToCreateFileSnafu { path: &path, err })?; + + std::fs::create_dir_all(parent) + .context(UnableToCreateDirSnafu { path: parent })?; + + continue; + } + Err(source) => return Err(Error::UnableToOpenFile { source, path }.into()), + } + } +} + +/// Returns the unique upload for the given path and suffix +fn staged_upload_path(dest: &std::path::Path, suffix: &str) -> PathBuf { let mut staging_path = dest.as_os_str().to_owned(); - staging_path.push(format!("#{multipart_id}")); + staging_path.push("#"); + staging_path.push(suffix); staging_path.into() } @@ -700,7 +721,7 @@ impl AsyncWrite for LocalUpload { Poll::Ready(res) => { res?; let staging_path = - get_upload_stage_path(&self.dest, &self.multipart_id); + staged_upload_path(&self.dest, &self.multipart_id); let dest = self.dest.clone(); self.inner_state = LocalUploadState::Committing(Box::pin( runtime @@ -741,7 +762,7 @@ impl AsyncWrite for LocalUpload { } } } else { - let staging_path = get_upload_stage_path(&self.dest, &self.multipart_id); + let staging_path = staged_upload_path(&self.dest, &self.multipart_id); match &mut self.inner_state { LocalUploadState::Idle(file) => { let file = Arc::clone(file); @@ -802,33 +823,6 @@ fn open_file(path: &PathBuf) -> Result { Ok(file) } -fn open_writable_file(path: &PathBuf) -> Result { - match File::create(path) { - Ok(f) => Ok(f), - Err(err) if err.kind() == std::io::ErrorKind::NotFound => { - let parent = path - .parent() - .context(UnableToCreateFileSnafu { path: &path, err })?; - std::fs::create_dir_all(parent) - .context(UnableToCreateDirSnafu { path: parent })?; - - match File::create(path) { - Ok(f) => Ok(f), - Err(err) => Err(Error::UnableToCreateFile { - path: path.to_path_buf(), - err, - } - .into()), - } - } - Err(err) => Err(Error::UnableToCreateFile { - path: path.to_path_buf(), - err, - } - .into()), - } -} - fn convert_entry(entry: DirEntry, location: Path) -> Result { let metadata = entry .metadata() From 1b14b4a2ab2371ec1e895fb119e704654d094a99 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Mar 2023 17:07:50 +0000 Subject: [PATCH 101/397] Add 
ObjectStore::append (#3791) --- src/lib.rs | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 671b22d..3af5382 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -278,7 +278,11 @@ pub type MultipartId = String; /// Universal API to multiple object store services. #[async_trait] pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { - /// Save the provided bytes to the specified location. + /// Save the provided bytes to the specified location + /// + /// The operation is guaranteed to be atomic, it will either successfully + /// write the entirety of `bytes` to `location`, or fail. No clients + /// should be able to observe a partially written object async fn put(&self, location: &Path, bytes: Bytes) -> Result<()>; /// Get a multi-part upload that allows writing data in chunks @@ -286,7 +290,9 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Most cloud-based uploads will buffer and upload parts in parallel. /// /// To complete the upload, [AsyncWrite::poll_shutdown] must be called - /// to completion. + /// to completion. This operation is guaranteed to be atomic, it will either + /// make all the written data available at `location`, or fail. No clients + /// should be able to observe a partially written object /// /// For some object stores (S3, GCS, and local in particular), if the /// writer fails or panics, you must call [ObjectStore::abort_multipart] @@ -306,6 +312,33 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { multipart_id: &MultipartId, ) -> Result<()>; + /// Returns an [`AsyncWrite`] that can be used to append to the object at `location` + /// + /// A new object will be created if it doesn't already exist, otherwise it will be + /// opened, with subsequent writes appended to the end. + /// + /// This operation cannot be supported by all stores, most use-cases should prefer + /// [`ObjectStore::put`] and [`ObjectStore::put_multipart`] for better portability + /// and stronger guarantees + /// + /// This API is not guaranteed to be atomic, in particular + /// + /// * On error, `location` may contain partial data + /// * Concurrent calls to [`ObjectStore::list`] may return partially written objects + /// * Concurrent calls to [`ObjectStore::get`] may return partially written data + /// * Concurrent calls to [`ObjectStore::put`] may result in data loss / corruption + /// * Concurrent calls to [`ObjectStore::append`] may result in data loss / corruption + /// + /// Additionally some stores, such as Azure, may only support appending to objects created + /// with [`ObjectStore::append`], and not with [`ObjectStore::put`], [`ObjectStore::copy`], or + /// [`ObjectStore::put_multipart`] + async fn append( + &self, + _location: &Path, + ) -> Result> { + Err(Error::NotImplemented) + } + /// Return the bytes that are stored at the specified location. 
async fn get(&self, location: &Path) -> Result; From 9b3b47ce1056db7e9314846537119206238e5be6 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Sat, 4 Mar 2023 18:37:06 +0100 Subject: [PATCH 102/397] Make InMemory object store track last modified time for each entry (#3796) * refactor: allow InMemoryUpload to store timestamp * use new last modified timestamp --- src/memory.rs | 67 ++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/memory.rs b/src/memory.rs index 40eee55..1433701 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -20,7 +20,7 @@ use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; use async_trait::async_trait; use bytes::Bytes; -use chrono::Utc; +use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use parking_lot::RwLock; use snafu::{ensure, OptionExt, Snafu}; @@ -33,6 +33,9 @@ use std::sync::Arc; use std::task::Poll; use tokio::io::AsyncWrite; +type Entry = (Bytes, DateTime); +type StorageType = Arc>>; + /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -73,7 +76,7 @@ impl From for super::Error { /// storage provider. #[derive(Debug, Default)] pub struct InMemory { - storage: Arc>>, + storage: StorageType, } impl std::fmt::Display for InMemory { @@ -85,7 +88,9 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.storage.write().insert(location.clone(), bytes); + self.storage + .write() + .insert(location.clone(), (bytes, Utc::now())); Ok(()) } @@ -113,19 +118,19 @@ impl ObjectStore for InMemory { } async fn get(&self, location: &Path) -> Result { - let data = self.get_bytes(location).await?; + let data = self.entry(location).await?; Ok(GetResult::Stream( - futures::stream::once(async move { Ok(data) }).boxed(), + futures::stream::once(async move { Ok(data.0) }).boxed(), )) } async fn get_range(&self, location: &Path, range: Range) -> Result { - let data = self.get_bytes(location).await?; - ensure!(range.end <= data.len(), OutOfRangeSnafu); + let data = self.entry(location).await?; + ensure!(range.end <= data.0.len(), OutOfRangeSnafu); ensure!(range.start <= range.end, BadRangeSnafu); - Ok(data.slice(range)) + Ok(data.0.slice(range)) } async fn get_ranges( @@ -133,24 +138,23 @@ impl ObjectStore for InMemory { location: &Path, ranges: &[Range], ) -> Result> { - let data = self.get_bytes(location).await?; + let data = self.entry(location).await?; ranges .iter() .map(|range| { - ensure!(range.end <= data.len(), OutOfRangeSnafu); + ensure!(range.end <= data.0.len(), OutOfRangeSnafu); ensure!(range.start <= range.end, BadRangeSnafu); - Ok(data.slice(range.clone())) + Ok(data.0.slice(range.clone())) }) .collect() } async fn head(&self, location: &Path) -> Result { - let last_modified = Utc::now(); - let bytes = self.get_bytes(location).await?; + let entry = self.entry(location).await?; Ok(ObjectMeta { location: location.clone(), - last_modified, - size: bytes.len(), + last_modified: entry.1, + size: entry.0.len(), }) } @@ -165,7 +169,6 @@ impl ObjectStore for InMemory { ) -> Result>> { let root = Path::default(); let prefix = prefix.unwrap_or(&root); - let last_modified = Utc::now(); let storage = self.storage.read(); let values: Vec<_> = storage @@ -180,8 +183,8 @@ impl ObjectStore for InMemory { .map(|(key, value)| { Ok(ObjectMeta { location: key.clone(), - 
last_modified, - size: value.len(), + last_modified: value.1, + size: value.0.len(), }) }) .collect(); @@ -197,7 +200,6 @@ impl ObjectStore for InMemory { let prefix = prefix.unwrap_or(&root); let mut common_prefixes = BTreeSet::new(); - let last_modified = Utc::now(); // Only objects in this base level should be returned in the // response. Otherwise, we just collect the common prefixes. @@ -224,8 +226,8 @@ impl ObjectStore for InMemory { } else { let object = ObjectMeta { location: k.clone(), - last_modified, - size: v.len(), + last_modified: v.1, + size: v.0.len(), }; objects.push(object); } @@ -238,13 +240,15 @@ impl ObjectStore for InMemory { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let data = self.get_bytes(from).await?; - self.storage.write().insert(to.clone(), data); + let data = self.entry(from).await?; + self.storage + .write() + .insert(to.clone(), (data.0, Utc::now())); Ok(()) } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - let data = self.get_bytes(from).await?; + let data = self.entry(from).await?; let mut storage = self.storage.write(); if storage.contains_key(to) { return Err(Error::AlreadyExists { @@ -252,7 +256,7 @@ impl ObjectStore for InMemory { } .into()); } - storage.insert(to.clone(), data); + storage.insert(to.clone(), (data.0, Utc::now())); Ok(()) } } @@ -273,22 +277,23 @@ impl InMemory { } } - async fn get_bytes(&self, location: &Path) -> Result { + async fn entry(&self, location: &Path) -> Result<(Bytes, DateTime)> { let storage = self.storage.read(); - let bytes = storage + let value = storage .get(location) .cloned() .context(NoDataInMemorySnafu { path: location.to_string(), })?; - Ok(bytes) + + Ok(value) } } struct InMemoryUpload { location: Path, data: Vec, - storage: Arc>>, + storage: StorageType, } impl AsyncWrite for InMemoryUpload { @@ -313,7 +318,9 @@ impl AsyncWrite for InMemoryUpload { _cx: &mut std::task::Context<'_>, ) -> std::task::Poll> { let data = Bytes::from(std::mem::take(&mut self.data)); - self.storage.write().insert(self.location.clone(), data); + self.storage + .write() + .insert(self.location.clone(), (data, Utc::now())); Poll::Ready(Ok(())) } } From ce35dbce89d6949febd90b67800bce0c895a9559 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 5 Mar 2023 13:10:12 +0000 Subject: [PATCH 103/397] Cleanup ApplicationDefaultCredentials (#3799) * Cleanup ApplicationDefaultCredentials * Fix doc --- src/gcp/credential.rs | 84 ++++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 853e4ce..a8dce71 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -62,8 +62,8 @@ pub enum Error { #[snafu(display("Error getting token response body: {}", source))] TokenResponseBody { source: reqwest::Error }, - #[snafu(display("A configuration file was passed in but was not used."))] - UnusedConfigurationFile, + #[snafu(display("Unsupported ApplicationCredentials type: {}", type_))] + UnsupportedCredentialsType { type_: String }, #[snafu(display("Error creating client: {}", source))] Client { source: crate::Error }, @@ -399,36 +399,60 @@ impl TokenProvider for InstanceCredentialProvider { } } +/// ApplicationDefaultCredentials +/// +#[derive(Debug)] +pub enum ApplicationDefaultCredentials { + /// + AuthorizedUser { + client_id: String, + client_secret: String, + refresh_token: String, + }, +} + +impl ApplicationDefaultCredentials { + pub fn 
new(path: Option<&str>) -> Result, Error> { + let file = match ApplicationDefaultCredentialsFile::read(path)? { + Some(f) => f, + None => return Ok(None), + }; + + Ok(Some(match file.type_.as_str() { + "authorized_user" => Self::AuthorizedUser { + client_id: file.client_id, + client_secret: file.client_secret, + refresh_token: file.refresh_token, + }, + type_ => return UnsupportedCredentialsTypeSnafu { type_ }.fail(), + })) + } +} + /// A deserialized `application_default_credentials.json`-file. /// -#[derive(serde::Deserialize, Debug)] -pub struct ApplicationDefaultCredentials { +#[derive(serde::Deserialize)] +struct ApplicationDefaultCredentialsFile { + #[serde(default)] client_id: String, + #[serde(default)] client_secret: String, + #[serde(default)] refresh_token: String, #[serde(rename = "type")] type_: String, } -impl ApplicationDefaultCredentials { - const DEFAULT_TOKEN_GCP_URI: &'static str = - "https://accounts.google.com/o/oauth2/token"; +impl ApplicationDefaultCredentialsFile { const CREDENTIALS_PATH: &'static str = ".config/gcloud/application_default_credentials.json"; - const EXPECTED_TYPE: &str = "authorized_user"; // Create a new application default credential in the following situations: // 1. a file is passed in and the type matches. // 2. without argument if the well-known configuration file is present. - pub fn new(path: Option<&str>) -> Result, Error> { + fn read(path: Option<&str>) -> Result, Error> { if let Some(path) = path { - if let Ok(credentials) = read_credentials_file::(path) { - if credentials.type_ == Self::EXPECTED_TYPE { - return Ok(Some(credentials)); - } - } - // Return an error if the path has not been used. - return Err(Error::UnusedConfigurationFile); + return read_credentials_file::(path).map(Some); } if let Some(home) = env::var_os("HOME") { let path = Path::new(&home).join(Self::CREDENTIALS_PATH); @@ -442,6 +466,8 @@ impl ApplicationDefaultCredentials { } } +const DEFAULT_TOKEN_GCP_URI: &str = "https://accounts.google.com/o/oauth2/token"; + #[async_trait] impl TokenProvider for ApplicationDefaultCredentials { async fn fetch_token( @@ -449,16 +475,24 @@ impl TokenProvider for ApplicationDefaultCredentials { client: &Client, retry: &RetryConfig, ) -> Result, Error> { - let body = [ - ("grant_type", "refresh_token"), - ("client_id", &self.client_id), - ("client_secret", &self.client_secret), - ("refresh_token", &self.refresh_token), - ]; + let builder = client.request(Method::POST, DEFAULT_TOKEN_GCP_URI); + let builder = match self { + Self::AuthorizedUser { + client_id, + client_secret, + refresh_token, + } => { + let body = [ + ("grant_type", "refresh_token"), + ("client_id", client_id), + ("client_secret", client_secret), + ("refresh_token", refresh_token), + ]; + builder.form(&body) + } + }; - let response = client - .request(Method::POST, Self::DEFAULT_TOKEN_GCP_URI) - .form(&body) + let response = builder .send_retry(retry) .await .context(TokenRequestSnafu)? From 79702853f85a8baec7ebac789635d44604e31d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Metehan=20Y=C4=B1ld=C4=B1r=C4=B1m?= <100111937+metesynnada@users.noreply.github.com> Date: Sat, 11 Mar 2023 14:30:57 +0300 Subject: [PATCH 104/397] [ObjectStore] Add `append` API impl for `LocalFileSystem` (#3824) * Append Push API * wasm is not enabled. 
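As a rough usage sketch of the new local append path (the directory, file name, and data below are illustrative, the crate is assumed to be consumed as `object_store`, and this is not part of the patch itself):

use object_store::{local::LocalFileSystem, path::Path, ObjectStore};
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Illustrative root; any writable directory works.
    let store = LocalFileSystem::new_with_prefix("/tmp/object-store-append-demo")?;
    let location = Path::from("logs/events.txt");

    // The first append creates the file (and any missing parent directories).
    let mut writer = store.append(&location).await?;
    writer.write_all(b"first line\n").await?;
    writer.shutdown().await?;

    // A later append re-opens the same file and continues writing at the end.
    let mut writer = store.append(&location).await?;
    writer.write_all(b"second line\n").await?;
    writer.shutdown().await?;

    let bytes = store.get(&location).await?.bytes().await?;
    assert_eq!(bytes.as_ref(), b"first line\nsecond line\n");
    Ok(())
}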
--- Cargo.toml | 7 ++- src/local.rs | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 144 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c0c090c..c6bb7e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,7 +37,6 @@ itertools = "0.10.1" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" -tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } tracing = { version = "0.1" } url = "2.2" walkdir = "2" @@ -57,6 +56,12 @@ aws-types = { version = "0.54", optional = true } aws-credential-types = { version = "0.54", optional = true } aws-config = { version = "0.54", optional = true } +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util", "fs"] } + +[target.'cfg(target_arch = "wasm32")'.dependencies] +tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } + [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] diff --git a/src/local.rs b/src/local.rs index f1733f5..ac0b020 100644 --- a/src/local.rs +++ b/src/local.rs @@ -269,7 +269,6 @@ impl Config { impl ObjectStore for LocalFileSystem { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { let path = self.config.path_to_filesystem(location)?; - maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); @@ -313,6 +312,53 @@ impl ObjectStore for LocalFileSystem { .await } + async fn append( + &self, + location: &Path, + ) -> Result> { + #[cfg(not(target_arch = "wasm32"))] + // Get the path to the file from the configuration. + let path = self.config.path_to_filesystem(location)?; + loop { + // Create new `OpenOptions`. + let mut options = tokio::fs::OpenOptions::new(); + + // Attempt to open the file with the given options. + match options + .truncate(false) + .append(true) + .create(true) + .open(&path) + .await + { + // If the file was successfully opened, return it wrapped in a boxed `AsyncWrite` trait object. + Ok(file) => return Ok(Box::new(file)), + // If the error is that the file was not found, attempt to create the file and any necessary parent directories. + Err(err) if err.kind() == ErrorKind::NotFound => { + // Get the path to the parent directory of the file. + let parent = path + .parent() + // If the parent directory does not exist, return a `UnableToCreateFileSnafu` error. + .context(UnableToCreateFileSnafu { path: &path, err })?; + + // Create the parent directory and any necessary ancestors. + tokio::fs::create_dir_all(parent) + .await + // If creating the directory fails, return a `UnableToCreateDirSnafu` error. + .context(UnableToCreateDirSnafu { path: parent })?; + // Try again to open the file. + continue; + } + // If any other error occurs, return a `UnableToOpenFile` error. 
+ Err(source) => { + return Err(Error::UnableToOpenFile { source, path }.into()) + } + } + } + #[cfg(target_arch = "wasm32")] + Err(super::Error::NotImplemented) + } + async fn get(&self, location: &Path) -> Result { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { @@ -1305,3 +1351,94 @@ mod tests { integration.list_with_delimiter(Some(&path)).await.unwrap(); } } + +#[cfg(not(target_arch = "wasm32"))] +#[cfg(test)] +mod not_wasm_tests { + use crate::local::LocalFileSystem; + use crate::{ObjectStore, Path}; + use bytes::Bytes; + use tempfile::TempDir; + use tokio::io::AsyncWriteExt; + + #[tokio::test] + async fn creates_dir_if_not_present_append() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("nested/file/test_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + let mut writer = integration.append(&location).await.unwrap(); + + writer.write_all(data.as_ref()).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + async fn unknown_length_append() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + let mut writer = integration.append(&location).await.unwrap(); + + writer.write_all(data.as_ref()).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + async fn multiple_append() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + + let data = vec![ + Bytes::from("arbitrary"), + Bytes::from("data"), + Bytes::from("gnz"), + ]; + + let mut writer = integration.append(&location).await.unwrap(); + for d in &data { + writer.write_all(d).await.unwrap(); + } + + let mut writer = integration.append(&location).await.unwrap(); + for d in &data { + writer.write_all(d).await.unwrap(); + } + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + let expected_data = Bytes::from("arbitrarydatagnzarbitrarydatagnz"); + assert_eq!(&*read_data, expected_data); + } +} From 180b7d6a6410ef8f938d43ef2bef45c059343acc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Mar 2023 14:28:40 +0000 Subject: [PATCH 105/397] Update quick-xml requirement from 0.27.0 to 0.28.0 (#3857) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.27.0...v0.28.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c6bb7e8..a385886 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } -quick-xml = { version = "0.27.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.28.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From c1a1f82edede4de07489a7ea5ce11cb5afab311b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Metehan=20Y=C4=B1ld=C4=B1r=C4=B1m?= <100111937+metesynnada@users.noreply.github.com> Date: Wed, 15 Mar 2023 21:40:29 +0300 Subject: [PATCH 106/397] Supporting metadata fetch without open file read mode (#3868) * Initial implementation * Formatting and test timeout. * Clippy issue * Fmt issue * Update object_store/Cargo.toml Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/local.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fmt --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- Cargo.toml | 3 +++ src/local.rs | 47 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a385886..3170872 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,6 +62,9 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut [target.'cfg(target_arch = "wasm32")'.dependencies] tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } +[target.'cfg(target_family="unix")'.dev-dependencies] +nix = "0.26.1" + [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] diff --git a/src/local.rs b/src/local.rs index ac0b020..9e710c2 100644 --- a/src/local.rs +++ b/src/local.rs @@ -400,13 +400,20 @@ impl ObjectStore for LocalFileSystem { let location = location.clone(); maybe_spawn_blocking(move || { - let file = open_file(&path)?; - let metadata = - file.metadata().map_err(|e| Error::UnableToAccessMetadata { - source: e.into(), - path: location.to_string(), - })?; - + let metadata = match metadata(&path) { + Err(e) => Err(if e.kind() == ErrorKind::NotFound { + Error::NotFound { + path: path.clone(), + source: e, + } + } else { + Error::UnableToAccessMetadata { + source: e.into(), + path: location.to_string(), + } + }), + Ok(m) => Ok(m), + }?; convert_metadata(metadata, location) }) .await @@ -1442,3 +1449,29 @@ mod not_wasm_tests { assert_eq!(&*read_data, expected_data); } } + +#[cfg(target_family = "unix")] +#[cfg(test)] +mod unix_test { + use crate::local::LocalFileSystem; + use crate::{ObjectStore, Path}; + use nix::sys::stat; + use nix::unistd; + use std::time::Duration; + use tempfile::TempDir; + use tokio::time::timeout; + + #[tokio::test] + async fn test_head_fifo() { + let filename = "some_file"; + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); 
+ unistd::mkfifo(&root.path().join(filename), stat::Mode::S_IRWXU).unwrap(); + let location = Path::from(filename); + if (timeout(Duration::from_millis(10), integration.head(&location)).await) + .is_err() + { + panic!("Did not receive value within 10 ms"); + } + } +} From 2ec67466104fbf301126848ad6d3fee5c27e609f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 15 Mar 2023 18:44:08 +0000 Subject: [PATCH 107/397] Rename PrefixObjectStore to PrefixStore (#3870) --- src/prefix.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/prefix.rs b/src/prefix.rs index d61fc22..c3a0ebd 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -27,21 +27,25 @@ use crate::{ Result as ObjectStoreResult, }; +#[doc(hidden)] +#[deprecated(note = "Use PrefixStore")] +pub type PrefixObjectStore = PrefixStore; + /// Store wrapper that applies a constant prefix to all paths handled by the store. #[derive(Debug, Clone)] -pub struct PrefixObjectStore { +pub struct PrefixStore { prefix: Path, inner: T, } -impl std::fmt::Display for PrefixObjectStore { +impl std::fmt::Display for PrefixStore { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "PrefixObjectStore({})", self.prefix.as_ref()) } } -impl PrefixObjectStore { - /// Create a new instance of [`PrefixObjectStore`] +impl PrefixStore { + /// Create a new instance of [`PrefixStore`] pub fn new(store: T, prefix: impl Into) -> Self { Self { prefix: prefix.into(), @@ -61,7 +65,7 @@ impl PrefixObjectStore { } #[async_trait::async_trait] -impl ObjectStore for PrefixObjectStore { +impl ObjectStore for PrefixStore { /// Save the provided bytes to the specified location. async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { let full_path = self.full_path(location); @@ -221,7 +225,7 @@ mod tests { async fn prefix_test() { let root = TempDir::new().unwrap(); let inner = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - let integration = PrefixObjectStore::new(inner, "prefix"); + let integration = PrefixStore::new(inner, "prefix"); put_get_delete_list(&integration).await; list_uses_directories_correctly(&integration).await; @@ -242,7 +246,7 @@ mod tests { local.put(&location, data).await.unwrap(); - let prefix = PrefixObjectStore::new(local, "prefix"); + let prefix = PrefixStore::new(local, "prefix"); let location_prefix = Path::from("test_file.json"); let content_list = flatten_list_stream(&prefix, None).await.unwrap(); From 925aab1922a742135164ed6dc358141c3bae4197 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 15 Mar 2023 19:12:13 +0000 Subject: [PATCH 108/397] Implement append for LimitStore, PrefixObjectStore, ThrottledStore (#3869) --- src/limit.rs | 9 +++++++++ src/prefix.rs | 48 +++++++++++++++++++++++------------------------- src/throttle.rs | 7 +++++++ 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/limit.rs b/src/limit.rs index 09c88aa..b3e55a9 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -95,6 +95,15 @@ impl ObjectStore for LimitStore { self.inner.abort_multipart(location, multipart_id).await } + async fn append( + &self, + location: &Path, + ) -> Result> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + let write = self.inner.append(location).await?; + Ok(Box::new(PermitWrapper::new(write, permit))) + } + async fn get(&self, location: &Path) -> Result { let permit = 
Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); match self.inner.get(location).await? { diff --git a/src/prefix.rs b/src/prefix.rs index c3a0ebd..7e7e716 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -66,20 +66,24 @@ impl PrefixStore { #[async_trait::async_trait] impl ObjectStore for PrefixStore { - /// Save the provided bytes to the specified location. async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { let full_path = self.full_path(location); self.inner.put(&full_path, bytes).await } - /// Return the bytes that are stored at the specified location. + async fn append( + &self, + location: &Path, + ) -> ObjectStoreResult> { + let full_path = self.full_path(location); + self.inner.append(&full_path).await + } + async fn get(&self, location: &Path) -> ObjectStoreResult { let full_path = self.full_path(location); self.inner.get(&full_path).await } - /// Return the bytes that are stored at the specified location - /// in the given byte range async fn get_range( &self, location: &Path, @@ -89,7 +93,15 @@ impl ObjectStore for PrefixStore { self.inner.get_range(&full_path, range).await } - /// Return the metadata for the specified location + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> ObjectStoreResult> { + let full_path = self.full_path(location); + self.inner.get_ranges(&full_path, ranges).await + } + async fn head(&self, location: &Path) -> ObjectStoreResult { let full_path = self.full_path(location); self.inner.head(&full_path).await.map(|meta| ObjectMeta { @@ -99,16 +111,11 @@ impl ObjectStore for PrefixStore { }) } - /// Delete the object at the specified location. async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { let full_path = self.full_path(location); self.inner.delete(&full_path).await } - /// List all the objects with the given prefix. - /// - /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of - /// `foo/bar_baz/x`. async fn list( &self, prefix: Option<&Path>, @@ -125,12 +132,6 @@ impl ObjectStore for PrefixStore { .boxed()) } - /// List objects with the given prefix and an implementation specific - /// delimiter. Returns common prefixes (directories) in addition to object - /// metadata. - /// - /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of - /// `foo/bar_baz/x`. async fn list_with_delimiter( &self, prefix: Option<&Path>, @@ -160,27 +161,24 @@ impl ObjectStore for PrefixStore { }) } - /// Copy an object from one path to another in the same object store. - /// - /// If there exists an object at the destination, it will be overwritten. async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); self.inner.copy(&full_from, &full_to).await } - /// Copy an object from one path to another, only if destination is empty. - /// - /// Will return an error if the destination already has an object. async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); self.inner.copy_if_not_exists(&full_from, &full_to).await } - /// Move an object from one path to another in the same object store. - /// - /// Will return an error if the destination already has an object. 
+ async fn rename(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + let full_from = self.full_path(from); + let full_to = self.full_path(to); + self.inner.rename(&full_from, &full_to).await + } + async fn rename_if_not_exists( &self, from: &Path, diff --git a/src/throttle.rs b/src/throttle.rs index 90f427c..6dff64a 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -166,6 +166,13 @@ impl ObjectStore for ThrottledStore { Err(super::Error::NotImplemented) } + async fn append( + &self, + _location: &Path, + ) -> Result> { + Err(super::Error::NotImplemented) + } + async fn get(&self, location: &Path) -> Result { sleep(self.config().wait_get_per_call).await; From 0ae33f393efda060c73bae595bf448b821adf09e Mon Sep 17 00:00:00 2001 From: kinrany Date: Tue, 21 Mar 2023 15:04:12 +0300 Subject: [PATCH 109/397] Impl ObjectStore for trait object (#3866) --- src/lib.rs | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 3af5382..706cc07 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -418,6 +418,86 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } } +#[async_trait] +impl ObjectStore for Box { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.as_ref().put(location, bytes).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + self.as_ref().put_multipart(location).await + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + self.as_ref().abort_multipart(location, multipart_id).await + } + + async fn append( + &self, + location: &Path, + ) -> Result> { + self.as_ref().append(location).await + } + + async fn get(&self, location: &Path) -> Result { + self.as_ref().get(location).await + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + self.as_ref().get_range(location, range).await + } + + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + self.as_ref().get_ranges(location, ranges).await + } + + async fn head(&self, location: &Path) -> Result { + self.as_ref().head(location).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.as_ref().delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + self.as_ref().list(prefix).await + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + self.as_ref().list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().copy(from, to).await + } + + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().rename(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().copy_if_not_exists(from, to).await + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().rename_if_not_exists(from, to).await + } +} + /// Result of a list call that includes objects, prefixes (directories) and a /// token for the next set of results. Individual result sets may be limited to /// 1,000 objects based on the underlying object storage's limitations. 
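A small sketch of what the forwarding impl above enables: a trait object does not automatically satisfy its own trait bound, so before this change a `Box<dyn ObjectStore>` could not be handed to code that is generic over `T: ObjectStore`. With the blanket impl it can (the helper below is illustrative and not part of the patch):

use object_store::{memory::InMemory, path::Path, ObjectStore};

// Generic over any concrete ObjectStore implementation.
async fn put_greeting<T: ObjectStore>(store: &T) -> object_store::Result<()> {
    store.put(&Path::from("greeting.txt"), "hello".into()).await
}

async fn demo() -> object_store::Result<()> {
    // A store chosen at runtime, held behind a trait object...
    let store: Box<dyn ObjectStore> = Box::new(InMemory::default());
    // ...now satisfies the `ObjectStore` bound itself and can be passed through unchanged.
    put_greeting(&store).await
}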
From 5938a4506dcdf2217a150a45ce329870f04cd2b9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 21 Mar 2023 14:34:05 +0000 Subject: [PATCH 110/397] Remove old object_store releases automatically (#3892) --- dev/release/release-tarball.sh | 3 ++ dev/release/remove-old-releases.sh | 45 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100755 dev/release/remove-old-releases.sh diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh index 75ff886..9581186 100755 --- a/dev/release/release-tarball.sh +++ b/dev/release/release-tarball.sh @@ -74,3 +74,6 @@ rm -rf ${tmp_dir} echo "Success!" echo "The release is available here:" echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" + +echo "Clean up old versions from svn" +"${SOURCE_TOP_DIR}"/dev/release/remove-old-releases.sh diff --git a/dev/release/remove-old-releases.sh b/dev/release/remove-old-releases.sh new file mode 100755 index 0000000..c8bd8b7 --- /dev/null +++ b/dev/release/remove-old-releases.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This script removes all but the most recent versions of arrow-rs +# from svn +# +# The older versions are in SVN history as well as available on the +# archive page https://archive.apache.org/dist/ +# +# See +# https://infra.apache.org/release-download-pages.html + +set -e +set -u + +svn_base="https://dist.apache.org/repos/dist/release/arrow" + +echo "Remove all but the most recent version" +old_releases=$( + svn ls ${svn_base} | \ + grep -E '^arrow-object-store-rs-[0-9\.]+' | \ + sort --version-sort --reverse | \ + tail -n +2 +) +for old_release_version in $old_releases; do + echo "Remove old release ${old_release_version}" + svn delete -m "Removing ${old_release_version}" ${svn_base}/${old_release_version} +done From 71ea6026da8777a36469016423a9943673fd544b Mon Sep 17 00:00:00 2001 From: Satyam Singh Date: Tue, 21 Mar 2023 20:05:41 +0530 Subject: [PATCH 111/397] Add support for checksum algorithms in AWS (#3873) * Add support for checksum algorithms in aws * Remove other algorithms * Only set when checksum algorithm is sha256 * Fix --- src/aws/checksum.rs | 51 +++++++++++++++++++++++++++++++++++++++++++ src/aws/client.rs | 24 ++++++++++++++++++-- src/aws/credential.rs | 22 ++++++++++++------- src/aws/mod.rs | 37 +++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 src/aws/checksum.rs diff --git a/src/aws/checksum.rs b/src/aws/checksum.rs new file mode 100644 index 0000000..ae35f06 --- /dev/null +++ b/src/aws/checksum.rs @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use ring::digest::{self, digest as ring_digest}; + +#[allow(non_camel_case_types)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// Enum representing checksum algorithm supported by S3. +pub enum Checksum { + /// SHA-256 algorithm. + SHA256, +} + +impl Checksum { + pub(super) fn digest(&self, bytes: &[u8]) -> Vec { + match self { + Self::SHA256 => ring_digest(&digest::SHA256, bytes).as_ref().to_owned(), + } + } + + pub(super) fn header_name(&self) -> &'static str { + match self { + Self::SHA256 => "x-amz-checksum-sha256", + } + } +} + +impl TryFrom<&String> for Checksum { + type Error = (); + + fn try_from(value: &String) -> Result { + match value.as_str() { + "sha256" => Ok(Self::SHA256), + _ => Err(()), + } + } +} diff --git a/src/aws/client.rs b/src/aws/client.rs index 0b0f883..bd58d09 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; use crate::aws::STRICT_PATH_ENCODE_SET; use crate::client::pagination::stream_paginated; @@ -26,6 +27,8 @@ use crate::{ BoxStream, ClientOptions, ListResult, MultipartId, ObjectMeta, Path, Result, RetryConfig, StreamExt, }; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::{utf8_percent_encode, PercentEncode}; @@ -205,6 +208,7 @@ pub struct S3Config { pub retry_config: RetryConfig, pub client_options: ClientOptions, pub sign_payload: bool, + pub checksum: Option, } impl S3Config { @@ -262,6 +266,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -281,10 +286,19 @@ impl S3Client { ) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); - let mut builder = self.client.request(Method::PUT, url); + let mut payload_sha256 = None; + if let Some(bytes) = bytes { - builder = builder.body(bytes) + if let Some(checksum) = self.config().checksum { + let digest = checksum.digest(&bytes); + builder = builder + .header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); + if checksum == Checksum::SHA256 { + payload_sha256 = Some(digest); + } + } + builder = builder.body(bytes); } if let Some(value) = self.config().client_options.get_content_type(path) { @@ -298,6 +312,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + payload_sha256, ) .send_retry(&self.config.retry_config) .await @@ -325,6 +340,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -349,6 +365,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) 
.send_retry(&self.config.retry_config) .await @@ -395,6 +412,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -438,6 +456,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -482,6 +501,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 05f2c53..183e843 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -84,7 +84,7 @@ const AUTH_HEADER: &str = "authorization"; const ALL_HEADERS: &[&str; 4] = &[DATE_HEADER, HASH_HEADER, TOKEN_HEADER, AUTH_HEADER]; impl<'a> RequestSigner<'a> { - fn sign(&self, request: &mut Request) { + fn sign(&self, request: &mut Request, pre_calculated_digest: Option>) { if let Some(ref token) = self.credential.token { let token_val = HeaderValue::from_str(token).unwrap(); request.headers_mut().insert(TOKEN_HEADER, token_val); @@ -101,9 +101,13 @@ impl<'a> RequestSigner<'a> { request.headers_mut().insert(DATE_HEADER, date_val); let digest = if self.sign_payload { - match request.body() { - None => EMPTY_SHA256_HASH.to_string(), - Some(body) => hex_digest(body.as_bytes().unwrap()), + if let Some(digest) = pre_calculated_digest { + hex_encode(&digest) + } else { + match request.body() { + None => EMPTY_SHA256_HASH.to_string(), + Some(body) => hex_digest(body.as_bytes().unwrap()), + } } } else { UNSIGNED_PAYLOAD_LITERAL.to_string() @@ -165,6 +169,7 @@ pub trait CredentialExt { region: &str, service: &str, sign_payload: bool, + payload_sha256: Option>, ) -> Self; } @@ -175,6 +180,7 @@ impl CredentialExt for RequestBuilder { region: &str, service: &str, sign_payload: bool, + payload_sha256: Option>, ) -> Self { // Hack around lack of access to underlying request // https://github.com/seanmonstar/reqwest/issues/1212 @@ -193,7 +199,7 @@ impl CredentialExt for RequestBuilder { sign_payload, }; - signer.sign(&mut request); + signer.sign(&mut request, payload_sha256); for header in ALL_HEADERS { if let Some(val) = request.headers_mut().remove(*header) { @@ -627,7 +633,7 @@ mod tests { sign_payload: true, }; - signer.sign(&mut request); + signer.sign(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } @@ -665,7 +671,7 @@ mod tests { sign_payload: false, }; - signer.sign(&mut request); + signer.sign(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699") } @@ -702,7 +708,7 @@ mod tests { sign_payload: true, }; - signer.sign(&mut request); + signer.sign(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=H20ABqCkLZID4rLe/20220809/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=9ebf2f92872066c99ac94e573b4e1b80f4dbb8a32b1e8e23178318746e7d1b4d") } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index c724886..7d10f37 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -47,6 +47,7 @@ use tokio::io::AsyncWrite; 
use tracing::info; use url::Url; +pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ AwsCredential, CredentialProvider, InstanceCredentialProvider, @@ -59,6 +60,7 @@ use crate::{ Result, RetryConfig, StreamExt, }; +mod checksum; mod client; mod credential; @@ -101,6 +103,9 @@ enum Error { source: std::num::ParseIntError, }, + #[snafu(display("Invalid Checksum algorithm"))] + InvalidChecksumAlgorithm, + #[snafu(display("Missing region"))] MissingRegion, @@ -386,6 +391,7 @@ pub struct AmazonS3Builder { imdsv1_fallback: bool, virtual_hosted_style_request: bool, unsigned_payload: bool, + checksum_algorithm: Option, metadata_endpoint: Option, profile: Option, client_options: ClientOptions, @@ -514,6 +520,11 @@ pub enum AmazonS3ConfigKey { /// - `unsigned_payload` UnsignedPayload, + /// Set the checksum algorithm for this client + /// + /// See [`AmazonS3Builder::with_checksum_algorithm`] + Checksum, + /// Set the instance metadata endpoint /// /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. @@ -546,6 +557,7 @@ impl AsRef for AmazonS3ConfigKey { Self::MetadataEndpoint => "aws_metadata_endpoint", Self::Profile => "aws_profile", Self::UnsignedPayload => "aws_unsigned_payload", + Self::Checksum => "aws_checksum_algorithm", } } } @@ -575,6 +587,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), + "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -694,6 +707,11 @@ impl AmazonS3Builder { AmazonS3ConfigKey::UnsignedPayload => { self.unsigned_payload = str_is_truthy(&value.into()) } + AmazonS3ConfigKey::Checksum => { + let algorithm = Checksum::try_from(&value.into()) + .map_err(|_| Error::InvalidChecksumAlgorithm)?; + self.checksum_algorithm = Some(algorithm) + } }; Ok(self) } @@ -846,6 +864,14 @@ impl AmazonS3Builder { self } + /// Sets the [checksum algorithm] which has to be used for object integrity check during upload. + /// + /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { + self.checksum_algorithm = Some(checksum_algorithm); + self + } + /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), /// used primarily within AWS EC2. 
/// @@ -992,6 +1018,7 @@ impl AmazonS3Builder { retry_config: self.retry_config, client_options: self.client_options, sign_payload: !self.unsigned_payload, + checksum: self.checksum_algorithm, }; let client = Arc::new(S3Client::new(config)?); @@ -1151,6 +1178,7 @@ mod tests { &container_creds_relative_uri, ); env::set_var("AWS_UNSIGNED_PAYLOAD", "true"); + env::set_var("AWS_CHECKSUM_ALGORITHM", "sha256"); let builder = AmazonS3Builder::from_env(); assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); @@ -1164,6 +1192,7 @@ mod tests { assert_eq!(builder.token.unwrap(), aws_session_token); let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); + assert_eq!(builder.checksum_algorithm.unwrap(), Checksum::SHA256); assert!(builder.unsigned_payload); } @@ -1181,6 +1210,7 @@ mod tests { ("aws_endpoint", aws_endpoint.clone()), ("aws_session_token", aws_session_token.clone()), ("aws_unsigned_payload", "true".to_string()), + ("aws_checksum_algorithm", "sha256".to_string()), ]); let builder = AmazonS3Builder::new() @@ -1193,6 +1223,7 @@ mod tests { assert_eq!(builder.region.unwrap(), aws_default_region); assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + assert_eq!(builder.checksum_algorithm.unwrap(), Checksum::SHA256); assert!(builder.unsigned_payload); } @@ -1256,6 +1287,12 @@ mod tests { let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; + + // run integration test with checksum set to sha256 + let config = maybe_skip_integration!().with_checksum_algorithm(Checksum::SHA256); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); + let integration = config.build().unwrap(); + put_get_delete_list_opts(&integration, is_local).await; } #[tokio::test] From 3d77bd5bd9d21d773db39aefa380c7c43fd7ce70 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 23 Mar 2023 15:53:56 -0500 Subject: [PATCH 112/397] fix: Specify content length for gcp copy request (#3921) * fix: Specify content length for gcp copy request * Include comment about native-tls/rust-tls Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/gcp/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 97f4444..fe79a6e 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -437,6 +437,9 @@ impl GoogleCloudStorageClient { builder .bearer_auth(token) + // Needed if reqwest is compiled with native-tls instead of rustls-tls + // See https://github.com/apache/arrow-rs/pull/3921 + .header(header::CONTENT_LENGTH, 0) .send_retry(&self.retry_config) .await .map_err(|err| { From e9acb0d0c67477c103cc704554c9a2c2ea84601d Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Sun, 26 Mar 2023 06:53:04 -0700 Subject: [PATCH 113/397] Remove incorrect validation logic on S3 bucket names (#3947) S3 bucket names can have dots in them, see [this documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html) > Bucket names can consist only of lowercase letters, numbers, dots (.), and hyphens (-). 
This was originally reported in delta-io/delta-rs#1239 by @gray-sat --- src/aws/mod.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 7d10f37..752fb2e 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -414,7 +414,7 @@ pub struct AmazonS3Builder { /// let typed_options = vec![ /// (AmazonS3ConfigKey::DefaultRegion, "my-default-region"), /// ]; -/// let azure = AmazonS3Builder::new() +/// let aws = AmazonS3Builder::new() /// .try_with_options(options) /// .unwrap() /// .try_with_options(typed_options) @@ -738,13 +738,9 @@ impl AmazonS3Builder { fn parse_url(&mut self, url: &str) -> Result<()> { let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), - false => Ok(s.to_string()), - }; match parsed.scheme() { - "s3" | "s3a" => self.bucket_name = Some(validate(host)?), + "s3" | "s3a" => self.bucket_name = Some(host.to_string()), "https" => match host.splitn(4, '.').collect_tuple() { Some(("s3", bucket, "amazonaws", "com")) => { self.bucket_name = Some(bucket.to_string()); @@ -1389,6 +1385,15 @@ mod tests { builder.parse_url("s3://bucket/path").unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("s3://buckets.can.have.dots/path") + .unwrap(); + assert_eq!( + builder.bucket_name, + Some("buckets.can.have.dots".to_string()) + ); + let mut builder = AmazonS3Builder::new(); builder .parse_url("https://s3.bucket.amazonaws.com") @@ -1405,7 +1410,6 @@ mod tests { let err_cases = [ "mailto://bucket/path", - "s3://bucket.mydomain/path", "https://s3.bucket.mydomain.com", "https://s3.bucket.foo.amazonaws.com", "https://bucket.mydomain.region.amazonaws.com", From 25fd5104f4ba0a286b4eed0a744a78781dfd3190 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 17:09:53 +0100 Subject: [PATCH 114/397] Use workspace dependencies (#3936) * Use workspace dependencies * Fix rustfmt * Fix pyarrow integration test --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3170872..d9b075f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,12 +18,12 @@ [package] name = "object_store" version = "0.5.5" -edition = "2021" +edition = { workspace = true } license = "MIT/Apache-2.0" readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." 
keywords = ["object", "storage", "cloud"] -repository = "https://github.com/apache/arrow-rs" +repository = { workspace = true } [package.metadata.docs.rs] all-features = true From c4e1477cf4c359966ea5cf9c69eefe787fc3f3ca Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 18:08:04 +0100 Subject: [PATCH 115/397] Add ObjectStore::list_with_offset (#3970) (#3973) * Stub out ObjectStore::list_with_offset (#3970) * Add tests and add AWS implementation * Update localstack * Add further implementations --- src/aws/client.rs | 27 ++++++++++---- src/aws/mod.rs | 19 ++++++++-- src/chunked.rs | 8 +++++ src/lib.rs | 91 ++++++++++++++++++++++++++++++++++++++++++++++- src/limit.rs | 10 ++++++ src/throttle.rs | 61 +++++++++++++++++-------------- 6 files changed, 180 insertions(+), 36 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index bd58d09..7ac4b70 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -382,6 +382,7 @@ impl S3Client { prefix: Option<&str>, delimiter: bool, token: Option<&str>, + offset: Option<&str>, ) -> Result<(ListResult, Option)> { let credential = self.get_credential().await?; let url = self.config.bucket_endpoint.clone(); @@ -403,6 +404,10 @@ impl S3Client { query.push(("prefix", prefix)) } + if let Some(offset) = offset { + query.push(("start-after", offset)) + } + let response = self .client .request(Method::GET, &url) @@ -433,14 +438,24 @@ impl S3Client { &self, prefix: Option<&Path>, delimiter: bool, + offset: Option<&Path>, ) -> BoxStream<'_, Result> { + let offset = offset.map(|x| x.to_string()); let prefix = format_prefix(prefix); - stream_paginated(prefix, move |prefix, token| async move { - let (r, next_token) = self - .list_request(prefix.as_deref(), delimiter, token.as_deref()) - .await?; - Ok((r, prefix, next_token)) - }) + stream_paginated( + (prefix, offset), + move |(prefix, offset), token| async move { + let (r, next_token) = self + .list_request( + prefix.as_deref(), + delimiter, + token.as_deref(), + offset.as_deref(), + ) + .await?; + Ok((r, (prefix, offset), next_token)) + }, + ) .boxed() } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 752fb2e..1e302e6 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -273,7 +273,22 @@ impl ObjectStore for AmazonS3 { ) -> Result>> { let stream = self .client - .list_paginated(prefix, false) + .list_paginated(prefix, false, None) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + let stream = self + .client + .list_paginated(prefix, false, Some(offset)) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() .boxed(); @@ -282,7 +297,7 @@ impl ObjectStore for AmazonS3 { } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true); + let mut stream = self.client.list_paginated(prefix, true, None); let mut common_prefixes = BTreeSet::new(); let mut objects = Vec::new(); diff --git a/src/chunked.rs b/src/chunked.rs index 76865ef..aebefec 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -174,6 +174,14 @@ impl ObjectStore for ChunkedStore { self.inner.list(prefix).await } + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + self.inner.list_with_offset(prefix, offset).await + } + async fn list_with_delimiter(&self, prefix: 
Option<&Path>) -> Result { self.inner.list_with_delimiter(prefix).await } diff --git a/src/lib.rs b/src/lib.rs index 706cc07..5737071 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -258,7 +258,7 @@ use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT} use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; -use futures::{stream::BoxStream, StreamExt}; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use snafu::Snafu; use std::fmt::{Debug, Formatter}; #[cfg(not(target_arch = "wasm32"))] @@ -371,11 +371,33 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of /// `foo/bar_baz/x`. + /// + /// Note: the order of returned [`ObjectMeta`] is not guaranteed async fn list( &self, prefix: Option<&Path>, ) -> Result>>; + /// List all the objects with the given prefix and a location greater than `offset` + /// + /// Some stores, such as S3 and GCS, may be able to push `offset` down to reduce + /// the number of network requests required + /// + /// Note: the order of returned [`ObjectMeta`] is not guaranteed + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + let offset = offset.clone(); + let stream = self + .list(prefix) + .await? + .try_filter(move |f| futures::future::ready(f.location > offset)) + .boxed(); + Ok(stream) + } + /// List objects with the given prefix and an implementation specific /// delimiter. Returns common prefixes (directories) in addition to object /// metadata. @@ -477,6 +499,14 @@ impl ObjectStore for Box { self.as_ref().list(prefix).await } + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + self.as_ref().list_with_offset(prefix, offset).await + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.as_ref().list_with_delimiter(prefix).await } @@ -926,6 +956,65 @@ mod tests { let files = flatten_list_stream(storage, None).await.unwrap(); assert!(files.is_empty(), "{files:?}"); + + // Test list order + let files = vec![ + Path::from("a a/b.file"), + Path::parse("a%2Fa.file").unwrap(), + Path::from("a/😀.file"), + Path::from("a/a file"), + Path::parse("a/a%2F.file").unwrap(), + Path::from("a/a.file"), + Path::from("a/a/b.file"), + Path::from("a/b.file"), + Path::from("aa/a.file"), + Path::from("ab/a.file"), + ]; + + for file in &files { + storage.put(file, "foo".into()).await.unwrap(); + } + + let cases = [ + (None, Path::from("a")), + (None, Path::from("a/a file")), + (None, Path::from("a/a/b.file")), + (None, Path::from("ab/a.file")), + (None, Path::from("a%2Fa.file")), + (None, Path::from("a/😀.file")), + (Some(Path::from("a")), Path::from("")), + (Some(Path::from("a")), Path::from("a")), + (Some(Path::from("a")), Path::from("a/😀")), + (Some(Path::from("a")), Path::from("a/😀.file")), + (Some(Path::from("a")), Path::from("a/b")), + (Some(Path::from("a")), Path::from("a/a/b.file")), + ]; + + for (prefix, offset) in cases { + let s = storage + .list_with_offset(prefix.as_ref(), &offset) + .await + .unwrap(); + + let mut actual: Vec<_> = + s.map_ok(|x| x.location).try_collect().await.unwrap(); + + actual.sort_unstable(); + + let expected: Vec<_> = files + .iter() + .cloned() + .filter(|x| { + let prefix_match = + prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); + prefix_match && x > &offset + }) + .collect(); + + assert_eq!(actual, expected, "{prefix:?} - 
{offset:?}"); + } + + delete_fixtures(storage).await; } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { diff --git a/src/limit.rs b/src/limit.rs index b3e55a9..d0d9f73 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -147,6 +147,16 @@ impl ObjectStore for LimitStore { Ok(PermitWrapper::new(s, permit).boxed()) } + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + let s = self.inner.list_with_offset(prefix, offset).await?; + Ok(PermitWrapper::new(s, permit).boxed()) + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.list_with_delimiter(prefix).await diff --git a/src/throttle.rs b/src/throttle.rs index 6dff64a..e513031 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -24,7 +24,7 @@ use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; use async_trait::async_trait; use bytes::Bytes; -use futures::{stream::BoxStream, StreamExt}; +use futures::{stream::BoxStream, FutureExt, StreamExt}; use std::time::Duration; use tokio::io::AsyncWrite; @@ -185,19 +185,10 @@ impl ObjectStore for ThrottledStore { GetResult::File(_, _) => unimplemented!(), }; - GetResult::Stream( - s.then(move |bytes_result| async move { - match bytes_result { - Ok(bytes) => { - let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); - sleep(wait_get_per_byte * bytes_len).await; - Ok(bytes) - } - Err(err) => Err(err), - } - }) - .boxed(), - ) + GetResult::Stream(throttle_stream(s, move |bytes| { + let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); + wait_get_per_byte * bytes_len + })) }) } @@ -247,20 +238,21 @@ impl ObjectStore for ThrottledStore { // need to copy to avoid moving / referencing `self` let wait_list_per_entry = self.config().wait_list_per_entry; + let stream = self.inner.list(prefix).await?; + Ok(throttle_stream(stream, move |_| wait_list_per_entry)) + } - self.inner.list(prefix).await.map(|stream| { - stream - .then(move |result| async move { - match result { - Ok(entry) => { - sleep(wait_list_per_entry).await; - Ok(entry) - } - Err(err) => Err(err), - } - }) - .boxed() - }) + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + sleep(self.config().wait_list_per_call).await; + + // need to copy to avoid moving / referencing `self` + let wait_list_per_entry = self.config().wait_list_per_entry; + let stream = self.inner.list_with_offset(prefix, offset).await?; + Ok(throttle_stream(stream, move |_| wait_list_per_entry)) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -307,6 +299,21 @@ fn usize_to_u32_saturate(x: usize) -> u32 { x.try_into().unwrap_or(u32::MAX) } +fn throttle_stream( + stream: BoxStream<'_, Result>, + delay: F, +) -> BoxStream<'_, Result> +where + F: Fn(&T) -> Duration + Send + Sync + 'static, +{ + stream + .then(move |result| { + let delay = result.as_ref().ok().map(&delay).unwrap_or_default(); + sleep(delay).then(|_| futures::future::ready(result)) + }) + .boxed() +} + #[cfg(test)] mod tests { use super::*; From 1ea1b3f93ea9319ef5e3b925f914aca662dd019b Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Fri, 31 Mar 2023 18:59:44 +0800 Subject: [PATCH 116/397] Fix typos (#3985) * fix typos in comments * fix typos in changelog. * fix typos in readme. * fix typos in string literals. * fix typos in unit tests func names. * fix typos in codes. 
--- CHANGELOG-old.md | 2 +- src/azure/client.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 58fb8a3..19a2766 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -217,6 +217,6 @@ - Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) - Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) - Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Increase upper wait time to reduce flakiness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/src/azure/client.rs b/src/azure/client.rs index c5a5652..494303d 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -436,7 +436,7 @@ fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result 0 && obj.location.as_ref().len() > prefix.as_ref().len() { Some(obj) From 9649f137a8d2a524ccee2c0675f21db49ff2e082 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 31 Mar 2023 12:02:06 +0100 Subject: [PATCH 117/397] Prepare object_store 0.5.6 (#3984) --- CHANGELOG-old.md | 24 +++++++++++++++++++ CHANGELOG.md | 40 +++++++++++++++++++++----------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 ++-- 4 files changed, 54 insertions(+), 16 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 19a2766..cc9453b 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,30 @@ # Historical Changelog +## [object_store_0.5.5](https://github.com/apache/arrow-rs/tree/object_store_0.5.5) (2023-02-27) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.4...object_store_0.5.5) + +**Implemented enhancements:** + +- object\_store: support azure cli credential [\#3697](https://github.com/apache/arrow-rs/issues/3697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: support encoded path as input [\#3651](https://github.com/apache/arrow-rs/issues/3651) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- object-store: aws\_profile fails to load static credentials [\#3765](https://github.com/apache/arrow-rs/issues/3765) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Inconsistent Behaviour Listing File [\#3712](https://github.com/apache/arrow-rs/issues/3712) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: bearer token is azure is used like access key [\#3696](https://github.com/apache/arrow-rs/issues/3696) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
+ +**Merged pull requests:** + +- object-store: fix handling of AWS profile credentials without expiry [\#3766](https://github.com/apache/arrow-rs/pull/3766) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([helmus](https://github.com/helmus)) +- update object\_store deps to patch potential security vulnerabilities [\#3761](https://github.com/apache/arrow-rs/pull/3761) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([spencerbart](https://github.com/spencerbart)) +- Filter exact list prefix matches for azure gen2 accounts [\#3714](https://github.com/apache/arrow-rs/pull/3714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Filter exact list prefix matches for MemoryStore and HttpStore \(\#3712\) [\#3713](https://github.com/apache/arrow-rs/pull/3713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: azure cli authorization [\#3698](https://github.com/apache/arrow-rs/pull/3698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- object\_store: add Path::from\_url\_path [\#3663](https://github.com/apache/arrow-rs/pull/3663) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jychen7](https://github.com/jychen7)) + ## [object_store_0.5.4](https://github.com/apache/arrow-rs/tree/object_store_0.5.4) (2023-01-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.3...object_store_0.5.4) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8f2fe8..b26ae71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,29 +19,43 @@ # Changelog -## [object_store_0.5.5](https://github.com/apache/arrow-rs/tree/object_store_0.5.5) (2023-02-27) +## [object_store_0.5.6](https://github.com/apache/arrow-rs/tree/object_store_0.5.6) (2023-03-30) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.4...object_store_0.5.5) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.5...object_store_0.5.6) **Implemented enhancements:** -- object\_store: support azure cli credential [\#3697](https://github.com/apache/arrow-rs/issues/3697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: support encoded path as input [\#3651](https://github.com/apache/arrow-rs/issues/3651) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Document ObjectStore::list Ordering [\#3975](https://github.com/apache/arrow-rs/issues/3975) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add option to start listing at a particular key [\#3970](https://github.com/apache/arrow-rs/issues/3970) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Implement `ObjectStore` for trait objects [\#3865](https://github.com/apache/arrow-rs/issues/3865) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add ObjectStore::append [\#3790](https://github.com/apache/arrow-rs/issues/3790) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Make `InMemory` object store track last modified time for each entry [\#3782](https://github.com/apache/arrow-rs/issues/3782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Unsigned S3 Payloads [\#3737](https://github.com/apache/arrow-rs/issues/3737) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add Content-MD5 or checksum header for using an Object Locked S3 [\#3725](https://github.com/apache/arrow-rs/issues/3725) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- object-store: aws\_profile fails to load static credentials [\#3765](https://github.com/apache/arrow-rs/issues/3765) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Inconsistent Behaviour Listing File [\#3712](https://github.com/apache/arrow-rs/issues/3712) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: bearer token is azure is used like access key [\#3696](https://github.com/apache/arrow-rs/issues/3696) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem::put is not Atomic [\#3780](https://github.com/apache/arrow-rs/issues/3780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- object-store: fix handling of AWS profile credentials without expiry [\#3766](https://github.com/apache/arrow-rs/pull/3766) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([helmus](https://github.com/helmus)) -- update object\_store deps to patch potential security vulnerabilities [\#3761](https://github.com/apache/arrow-rs/pull/3761) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([spencerbart](https://github.com/spencerbart)) -- Filter exact list prefix matches for azure gen2 accounts [\#3714](https://github.com/apache/arrow-rs/pull/3714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Filter exact list prefix matches for MemoryStore and HttpStore \(\#3712\) [\#3713](https://github.com/apache/arrow-rs/pull/3713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- object\_store: azure cli authorization [\#3698](https://github.com/apache/arrow-rs/pull/3698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- object\_store: add Path::from\_url\_path [\#3663](https://github.com/apache/arrow-rs/pull/3663) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jychen7](https://github.com/jychen7)) +- Add ObjectStore::list\_with\_offset \(\#3970\) [\#3973](https://github.com/apache/arrow-rs/pull/3973) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove incorrect validation logic on S3 bucket names [\#3947](https://github.com/apache/arrow-rs/pull/3947) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([rtyler](https://github.com/rtyler)) +- Prepare arrow 36 [\#3935](https://github.com/apache/arrow-rs/pull/3935) ([tustvold](https://github.com/tustvold)) +- fix: Specify content length for gcp copy request [\#3921](https://github.com/apache/arrow-rs/pull/3921) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) +- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) +- Add support for checksum algorithms in AWS [\#3873](https://github.com/apache/arrow-rs/pull/3873) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) +- 
Rename PrefixObjectStore to PrefixStore [\#3870](https://github.com/apache/arrow-rs/pull/3870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement append for LimitStore, PrefixObjectStore, ThrottledStore [\#3869](https://github.com/apache/arrow-rs/pull/3869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Supporting metadata fetch without open file read mode [\#3868](https://github.com/apache/arrow-rs/pull/3868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) +- Impl ObjectStore for trait object [\#3866](https://github.com/apache/arrow-rs/pull/3866) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Kinrany](https://github.com/Kinrany)) +- Update quick-xml requirement from 0.27.0 to 0.28.0 [\#3857](https://github.com/apache/arrow-rs/pull/3857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update changelog for 35.0.0 [\#3843](https://github.com/apache/arrow-rs/pull/3843) ([tustvold](https://github.com/tustvold)) +- Cleanup ApplicationDefaultCredentials [\#3799](https://github.com/apache/arrow-rs/pull/3799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make InMemory object store track last modified time for each entry [\#3796](https://github.com/apache/arrow-rs/pull/3796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) +- Add ObjectStore::append [\#3791](https://github.com/apache/arrow-rs/pull/3791) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make LocalFileSystem::put atomic \(\#3780\) [\#3781](https://github.com/apache/arrow-rs/pull/3781) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add support for unsigned payloads in aws [\#3741](https://github.com/apache/arrow-rs/pull/3741) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) diff --git a/Cargo.toml b/Cargo.toml index d9b075f..bd0bbb7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.5" +version = "0.5.6" edition = { workspace = true } license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index de80d0f..b69d36f 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.4" -FUTURE_RELEASE="object_store_0.5.5" +SINCE_TAG="object_store_0.5.5" +FUTURE_RELEASE="object_store_0.5.6" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 7870fcc0a6e8a176f8c1640b828b8023ff7f5c44 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 31 Mar 2023 12:17:53 +0100 Subject: [PATCH 118/397] Revert workspace links for object_store (#3987) --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bd0bbb7..9bf1043 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,12 +18,12 @@ [package] name = "object_store" version = 
"0.5.6" -edition = { workspace = true } +edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] -repository = { workspace = true } +repository = "https://github.com/apache/arrow-rs/tree/master/object_store" [package.metadata.docs.rs] all-features = true From 4818da5b81292887731c6506dd1872af9ea2fee8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 31 Mar 2023 19:10:45 +0100 Subject: [PATCH 119/397] Update AWS SDK (#3993) --- Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9bf1043..fcdbd98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,9 +52,9 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } # AWS Profile support -aws-types = { version = "0.54", optional = true } -aws-credential-types = { version = "0.54", optional = true } -aws-config = { version = "0.54", optional = true } +aws-types = { version = "0.55", optional = true } +aws-credential-types = { version = "0.55", optional = true } +aws-config = { version = "0.55", optional = true } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util", "fs"] } From d354644d317e764fcb6ed6c01c23a6a1ed23822f Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Mon, 3 Apr 2023 14:56:47 +0200 Subject: [PATCH 120/397] feat: add etag for objectMeta (#3937) * feat: add etag for objectMeta * replace the manual etag in response * fix typo * use option for e_tag * remove useless packages --- src/aws/client.rs | 3 +++ src/aws/mod.rs | 7 ++++++- src/azure/client.rs | 4 +++- src/azure/mod.rs | 12 +++++++++++- src/gcp/mod.rs | 5 ++++- src/http/client.rs | 7 ++++++- src/lib.rs | 2 ++ src/local.rs | 4 +++- src/memory.rs | 4 ++++ src/prefix.rs | 3 +++ 10 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 7ac4b70..9634c74 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -164,6 +164,8 @@ pub struct ListContents { pub key: String, pub size: usize, pub last_modified: DateTime, + #[serde(rename = "ETag")] + pub e_tag: Option, } impl TryFrom for ObjectMeta { @@ -174,6 +176,7 @@ impl TryFrom for ObjectMeta { location: Path::parse(value.key)?, last_modified: value.last_modified, size: value.size, + e_tag: value.e_tag, }) } } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 1e302e6..f88960b 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -232,7 +232,7 @@ impl ObjectStore for AmazonS3 { } async fn head(&self, location: &Path) -> Result { - use reqwest::header::{CONTENT_LENGTH, LAST_MODIFIED}; + use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; // Extract meta from headers // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax @@ -256,10 +256,15 @@ impl ObjectStore for AmazonS3 { let content_length = content_length .parse() .context(InvalidContentLengthSnafu { content_length })?; + + let e_tag = headers.get(ETAG).context(MissingEtagSnafu)?; + let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; + Ok(ObjectMeta { location: location.clone(), last_modified, size: content_length, + e_tag: Some(e_tag.to_string()), }) } diff --git a/src/azure/client.rs 
b/src/azure/client.rs index 494303d..87432f6 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -489,6 +489,7 @@ impl TryFrom for ObjectMeta { location: Path::parse(value.name)?, last_modified: value.properties.last_modified, size: value.properties.content_length as usize, + e_tag: value.properties.e_tag, }) } } @@ -501,7 +502,6 @@ impl TryFrom for ObjectMeta { struct BlobProperties { #[serde(deserialize_with = "deserialize_rfc1123", rename = "Last-Modified")] pub last_modified: DateTime, - pub etag: String, #[serde(rename = "Content-Length")] pub content_length: u64, #[serde(rename = "Content-Type")] @@ -510,6 +510,8 @@ struct BlobProperties { pub content_encoding: Option, #[serde(rename = "Content-Language")] pub content_language: Option, + #[serde(rename = "Etag")] + pub e_tag: Option, } #[derive(Debug, Clone, PartialEq, Eq)] diff --git a/src/azure/mod.rs b/src/azure/mod.rs index e5f1465..c2e72f2 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -140,6 +140,9 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("ETag Header missing from response"))] + MissingEtag, } impl From for super::Error { @@ -232,7 +235,7 @@ impl ObjectStore for MicrosoftAzure { } async fn head(&self, location: &Path) -> Result { - use reqwest::header::{CONTENT_LENGTH, LAST_MODIFIED}; + use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; // Extract meta from headers // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties @@ -257,10 +260,17 @@ impl ObjectStore for MicrosoftAzure { .parse() .context(InvalidContentLengthSnafu { content_length })?; + let e_tag = headers + .get(ETAG) + .ok_or(Error::MissingEtag)? + .to_str() + .context(BadHeaderSnafu)?; + Ok(ObjectMeta { location: location.clone(), last_modified, size: content_length, + e_tag: Some(e_tag.to_string()), }) } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index fe79a6e..5247693 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -196,6 +196,8 @@ struct Object { name: String, size: String, updated: DateTime, + #[serde(rename = "etag")] + e_tag: Option, } #[derive(serde::Deserialize, Debug)] @@ -209,7 +211,6 @@ struct InitiateMultipartUploadResult { struct MultipartPart { #[serde(rename = "PartNumber")] part_number: usize, - #[serde(rename = "ETag")] e_tag: String, } @@ -1170,11 +1171,13 @@ fn convert_object_meta(object: &Object) -> Result { let location = Path::parse(&object.name)?; let last_modified = object.updated; let size = object.size.parse().context(InvalidSizeSnafu)?; + let e_tag = object.e_tag.clone(); Ok(ObjectMeta { location, last_modified, size, + e_tag, }) } diff --git a/src/http/client.rs b/src/http/client.rs index 799c5be..5ef2721 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -335,10 +335,12 @@ impl MultiStatusResponse { /// Returns this objects metadata as [`ObjectMeta`] pub fn object_meta(&self, base_url: &Url) -> Result { + let last_modified = self.prop_stat.prop.last_modified; Ok(ObjectMeta { location: self.path(base_url)?, - last_modified: self.prop_stat.prop.last_modified, + last_modified, size: self.size()?, + e_tag: self.prop_stat.prop.e_tag.clone(), }) } @@ -364,6 +366,9 @@ pub struct Prop { #[serde(rename = "resourcetype")] resource_type: ResourceType, + + #[serde(rename = "getetag")] + e_tag: Option, } #[derive(Deserialize)] diff --git a/src/lib.rs b/src/lib.rs index 5737071..c31027c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -548,6 +548,8 @@ pub struct ObjectMeta { pub 
last_modified: DateTime, /// The size in bytes of the object pub size: usize, + /// The unique identifier for the object + pub e_tag: Option, } /// Result for a get request diff --git a/src/local.rs b/src/local.rs index 9e710c2..d2553d4 100644 --- a/src/local.rs +++ b/src/local.rs @@ -23,6 +23,7 @@ use crate::{ }; use async_trait::async_trait; use bytes::Bytes; +use chrono::{DateTime, Utc}; use futures::future::BoxFuture; use futures::FutureExt; use futures::{stream::BoxStream, StreamExt}; @@ -887,7 +888,7 @@ fn convert_entry(entry: DirEntry, location: Path) -> Result { } fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { - let last_modified = metadata + let last_modified: DateTime = metadata .modified() .expect("Modified file time should be supported on this platform") .into(); @@ -900,6 +901,7 @@ fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result Result { let entry = self.entry(location).await?; + Ok(ObjectMeta { location: location.clone(), last_modified: entry.1, size: entry.0.len(), + e_tag: None, }) } @@ -185,6 +187,7 @@ impl ObjectStore for InMemory { location: key.clone(), last_modified: value.1, size: value.0.len(), + e_tag: None, }) }) .collect(); @@ -228,6 +231,7 @@ impl ObjectStore for InMemory { location: k.clone(), last_modified: v.1, size: v.0.len(), + e_tag: None, }; objects.push(object); } diff --git a/src/prefix.rs b/src/prefix.rs index 7e7e716..eba3795 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -108,6 +108,7 @@ impl ObjectStore for PrefixStore { last_modified: meta.last_modified, size: meta.size, location: self.strip_prefix(&meta.location).unwrap_or(meta.location), + e_tag: meta.e_tag, }) } @@ -128,6 +129,7 @@ impl ObjectStore for PrefixStore { last_modified: meta.last_modified, size: meta.size, location: self.strip_prefix(&meta.location).unwrap_or(meta.location), + e_tag: meta.e_tag, }) .boxed()) } @@ -155,6 +157,7 @@ impl ObjectStore for PrefixStore { last_modified: meta.last_modified, size: meta.size, location: self.strip_prefix(&meta.location)?, + e_tag: meta.e_tag.clone(), }) }) .collect(), From 62b786f4d69c4d4117ff40c14b5cc3608406f7f4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 9 Apr 2023 20:32:41 +0100 Subject: [PATCH 121/397] Use reqwest build_split (#4039) * Use reqwest build_split * Fix typo --- src/aws/credential.rs | 21 ++++----------------- src/azure/credential.rs | 36 +++++++++++++++--------------------- 2 files changed, 19 insertions(+), 38 deletions(-) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 183e843..c4cb7cf 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -81,8 +81,6 @@ const HASH_HEADER: &str = "x-amz-content-sha256"; const TOKEN_HEADER: &str = "x-amz-security-token"; const AUTH_HEADER: &str = "authorization"; -const ALL_HEADERS: &[&str; 4] = &[DATE_HEADER, HASH_HEADER, TOKEN_HEADER, AUTH_HEADER]; - impl<'a> RequestSigner<'a> { fn sign(&self, request: &mut Request, pre_calculated_digest: Option>) { if let Some(ref token) = self.credential.token { @@ -175,20 +173,15 @@ pub trait CredentialExt { impl CredentialExt for RequestBuilder { fn with_aws_sigv4( - mut self, + self, credential: &AwsCredential, region: &str, service: &str, sign_payload: bool, payload_sha256: Option>, ) -> Self { - // Hack around lack of access to underlying request - // https://github.com/seanmonstar/reqwest/issues/1212 - let mut request = self - .try_clone() - .expect("not stream") - .build() - .expect("request valid"); + 
let (client, request) = self.build_split(); + let mut request = request.expect("request valid"); let date = Utc::now(); let signer = RequestSigner { @@ -200,13 +193,7 @@ impl CredentialExt for RequestBuilder { }; signer.sign(&mut request, payload_sha256); - - for header in ALL_HEADERS { - if let Some(val) = request.headers_mut().remove(*header) { - self = self.header(*header, val) - } - } - self + Self::from_parts(client, request) } } diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 9e07222..0196d93 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -124,16 +124,11 @@ impl CredentialExt for RequestBuilder { .header(DATE, &date_val) .header(&VERSION, &AZURE_VERSION); - // Hack around lack of access to underlying request - // https://github.com/seanmonstar/reqwest/issues/1212 - let request = self - .try_clone() - .expect("not stream") - .build() - .expect("request valid"); - match credential { AzureCredential::AccessKey(key) => { + let (client, request) = self.build_split(); + let mut request = request.expect("request valid"); + let signature = generate_authorization( request.headers(), request.url(), @@ -141,22 +136,21 @@ impl CredentialExt for RequestBuilder { account, key.as_str(), ); - self = self - // "signature" is a base 64 encoded string so it should never contain illegal characters. - .header( - AUTHORIZATION, - HeaderValue::from_str(signature.as_str()).unwrap(), - ); + + // "signature" is a base 64 encoded string so it should never + // contain illegal characters + request.headers_mut().append( + AUTHORIZATION, + HeaderValue::from_str(signature.as_str()).unwrap(), + ); + + Self::from_parts(client, request) } AzureCredential::AuthorizationToken(token) => { - self = self.header(AUTHORIZATION, token); + self.header(AUTHORIZATION, token) } - AzureCredential::SASToken(query_pairs) => { - self = self.query(&query_pairs); - } - }; - - self + AzureCredential::SASToken(query_pairs) => self.query(&query_pairs), + } } } From ddce8fb9eca9438c688f4e49686c7ab528257138 Mon Sep 17 00:00:00 2001 From: "r.4ntix" Date: Mon, 10 Apr 2023 23:13:00 +0800 Subject: [PATCH 122/397] Add get_config_value to AWS/Azure/GCP Builders (#4035) * minor: make struct fields of Builders(S3/Azure/GCS) to pub * minor: use `get_config_value` method instead of public fields * fix clippy error --- src/aws/checksum.rs | 10 ++++- src/aws/mod.rs | 103 ++++++++++++++++++++++++++++++++++++++++++++ src/azure/mod.rs | 81 ++++++++++++++++++++++++++++++++++ src/gcp/mod.rs | 56 ++++++++++++++++++++++++ 4 files changed, 249 insertions(+), 1 deletion(-) diff --git a/src/aws/checksum.rs b/src/aws/checksum.rs index ae35f06..c787c28 100644 --- a/src/aws/checksum.rs +++ b/src/aws/checksum.rs @@ -39,11 +39,19 @@ impl Checksum { } } +impl std::fmt::Display for Checksum { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self { + Self::SHA256 => write!(f, "sha256"), + } + } +} + impl TryFrom<&String> for Checksum { type Error = (); fn try_from(value: &String) -> Result { - match value.as_str() { + match value.to_lowercase().as_str() { "sha256" => Ok(Self::SHA256), _ => Err(()), } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index f88960b..de62360 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -400,20 +400,35 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload { /// ``` #[derive(Debug, Default, Clone)] pub struct AmazonS3Builder { + /// Access key id access_key_id: Option, + /// Secret access_key secret_access_key: Option, + /// Region region: Option, + /// Bucket 
name bucket_name: Option, + /// Endpoint for communicating with AWS S3 endpoint: Option, + /// Token to use for requests token: Option, + /// Url url: Option, + /// Retry config retry_config: RetryConfig, + /// When set to true, fallback to IMDSv1 imdsv1_fallback: bool, + /// When set to true, virtual hosted style request has to be used virtual_hosted_style_request: bool, + /// When set to true, unsigned payload option has to be used unsigned_payload: bool, + /// Checksum algorithm which has to be used for object integrity check during upload checksum_algorithm: Option, + /// Metadata endpoint, see metadata_endpoint: Option, + /// Profile name, see profile: Option, + /// Client options client_options: ClientOptions, } @@ -751,6 +766,38 @@ impl AmazonS3Builder { Ok(self) } + /// Get config value via a [`AmazonS3ConfigKey`]. + /// + /// # Example + /// ``` + /// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; + /// + /// let builder = AmazonS3Builder::from_env() + /// .with_bucket_name("foo"); + /// let bucket_name = builder.get_config_value(&AmazonS3ConfigKey::Bucket).unwrap_or_default(); + /// assert_eq!("foo", &bucket_name); + /// ``` + pub fn get_config_value(&self, key: &AmazonS3ConfigKey) -> Option { + match key { + AmazonS3ConfigKey::AccessKeyId => self.access_key_id.clone(), + AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key.clone(), + AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => { + self.region.clone() + } + AmazonS3ConfigKey::Bucket => self.bucket_name.clone(), + AmazonS3ConfigKey::Endpoint => self.endpoint.clone(), + AmazonS3ConfigKey::Token => self.token.clone(), + AmazonS3ConfigKey::ImdsV1Fallback => Some(self.imdsv1_fallback.to_string()), + AmazonS3ConfigKey::VirtualHostedStyleRequest => { + Some(self.virtual_hosted_style_request.to_string()) + } + AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), + AmazonS3ConfigKey::Profile => self.profile.clone(), + AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), + AmazonS3ConfigKey::Checksum => self.checksum_algorithm.map(|v| v.to_string()), + } + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -1272,6 +1319,62 @@ mod tests { assert!(builder.unsigned_payload); } + #[test] + fn s3_test_config_get_value() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + let options = HashMap::from([ + (AmazonS3ConfigKey::AccessKeyId, aws_access_key_id.clone()), + ( + AmazonS3ConfigKey::SecretAccessKey, + aws_secret_access_key.clone(), + ), + (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), + (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), + (AmazonS3ConfigKey::Token, aws_session_token.clone()), + (AmazonS3ConfigKey::UnsignedPayload, "true".to_string()), + ]); + + let builder = AmazonS3Builder::new().try_with_options(&options).unwrap(); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::AccessKeyId) + .unwrap(), + aws_access_key_id + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::SecretAccessKey) + .unwrap(), + aws_secret_access_key + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::DefaultRegion) + 
.unwrap(), + aws_default_region + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::Endpoint) + .unwrap(), + aws_endpoint + ); + assert_eq!( + builder.get_config_value(&AmazonS3ConfigKey::Token).unwrap(), + aws_session_token + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::UnsignedPayload) + .unwrap(), + "true" + ); + } + #[test] fn s3_test_config_fallible_options() { let aws_access_key_id = "object_store:fake_access_key_id".to_string(); diff --git a/src/azure/mod.rs b/src/azure/mod.rs index c2e72f2..11350a2 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -394,24 +394,43 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { /// ``` #[derive(Default, Clone)] pub struct MicrosoftAzureBuilder { + /// Account name account_name: Option, + /// Access key access_key: Option, + /// Container name container_name: Option, + /// Bearer token bearer_token: Option, + /// Client id client_id: Option, + /// Client secret client_secret: Option, + /// Tenant id tenant_id: Option, + /// Query pairs for shared access signature authorization sas_query_pairs: Option>, + /// Shared access signature sas_key: Option, + /// Authority host authority_host: Option, + /// Url url: Option, + /// When set to true, azurite storage emulator has to be used use_emulator: bool, + /// Msi endpoint for acquiring managed identity token msi_endpoint: Option, + /// Object id for use with managed identity authentication object_id: Option, + /// Msi resource id for use with managed identity authentication msi_resource_id: Option, + /// File containing token for Azure AD workload identity federation federated_token_file: Option, + /// When set to true, azure cli has to be used for acquiring access token use_azure_cli: bool, + /// Retry config retry_config: RetryConfig, + /// Client options client_options: ClientOptions, } @@ -747,6 +766,35 @@ impl MicrosoftAzureBuilder { Ok(self) } + /// Get config value via a [`AzureConfigKey`]. 
+ /// + /// # Example + /// ``` + /// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; + /// + /// let builder = MicrosoftAzureBuilder::from_env() + /// .with_account("foo"); + /// let account_name = builder.get_config_value(&AzureConfigKey::AccountName).unwrap_or_default(); + /// assert_eq!("foo", &account_name); + /// ``` + pub fn get_config_value(&self, key: &AzureConfigKey) -> Option { + match key { + AzureConfigKey::AccountName => self.account_name.clone(), + AzureConfigKey::AccessKey => self.access_key.clone(), + AzureConfigKey::ClientId => self.client_id.clone(), + AzureConfigKey::ClientSecret => self.client_secret.clone(), + AzureConfigKey::AuthorityId => self.tenant_id.clone(), + AzureConfigKey::SasKey => self.sas_key.clone(), + AzureConfigKey::Token => self.bearer_token.clone(), + AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), + AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), + AzureConfigKey::ObjectId => self.object_id.clone(), + AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), + AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), + AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), + } + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -1252,6 +1300,39 @@ mod tests { assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); } + #[test] + fn azure_test_config_get_value() { + let azure_client_id = "object_store:fake_access_key_id".to_string(); + let azure_storage_account_name = "object_store:fake_secret_key".to_string(); + let azure_storage_token = "object_store:fake_default_region".to_string(); + let options = HashMap::from([ + (AzureConfigKey::ClientId, azure_client_id.clone()), + ( + AzureConfigKey::AccountName, + azure_storage_account_name.clone(), + ), + (AzureConfigKey::Token, azure_storage_token.clone()), + ]); + + let builder = MicrosoftAzureBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!( + builder.get_config_value(&AzureConfigKey::ClientId).unwrap(), + azure_client_id + ); + assert_eq!( + builder + .get_config_value(&AzureConfigKey::AccountName) + .unwrap(), + azure_storage_account_name + ); + assert_eq!( + builder.get_config_value(&AzureConfigKey::Token).unwrap(), + azure_storage_token + ); + } + #[test] fn azure_test_config_fallible_options() { let azure_client_id = "object_store:fake_access_key_id".to_string(); diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 5247693..a6cf660 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -768,12 +768,19 @@ impl ObjectStore for GoogleCloudStorage { /// ``` #[derive(Debug, Clone)] pub struct GoogleCloudStorageBuilder { + /// Bucket name bucket_name: Option, + /// Url url: Option, + /// Path to the service account file service_account_path: Option, + /// The serialized service account key service_account_key: Option, + /// Path to the application credentials file. application_credentials_path: Option, + /// Retry config retry_config: RetryConfig, + /// Client options client_options: ClientOptions, } @@ -983,6 +990,28 @@ impl GoogleCloudStorageBuilder { Ok(self) } + /// Get config value via a [`GoogleConfigKey`]. 
+ /// + /// # Example + /// ``` + /// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; + /// + /// let builder = GoogleCloudStorageBuilder::from_env() + /// .with_service_account_key("foo"); + /// let service_account_key = builder.get_config_value(&GoogleConfigKey::ServiceAccountKey).unwrap_or_default(); + /// assert_eq!("foo", &service_account_key); + /// ``` + pub fn get_config_value(&self, key: &GoogleConfigKey) -> Option { + match key { + GoogleConfigKey::ServiceAccount => self.service_account_path.clone(), + GoogleConfigKey::ServiceAccountKey => self.service_account_key.clone(), + GoogleConfigKey::Bucket => self.bucket_name.clone(), + GoogleConfigKey::ApplicationCredentials => { + self.application_credentials_path.clone() + } + } + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -1452,6 +1481,33 @@ mod test { assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); } + #[test] + fn gcs_test_config_get_value() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ( + GoogleConfigKey::ServiceAccount, + google_service_account.clone(), + ), + (GoogleConfigKey::Bucket, google_bucket_name.clone()), + ]); + + let builder = GoogleCloudStorageBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!( + builder + .get_config_value(&GoogleConfigKey::ServiceAccount) + .unwrap(), + google_service_account + ); + assert_eq!( + builder.get_config_value(&GoogleConfigKey::Bucket).unwrap(), + google_bucket_name + ); + } + #[test] fn gcs_test_config_fallible_options() { let google_service_account = "object_store:fake_service_account".to_string(); From 833495516bcbc287b26292487d36a374ca4b6e17 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Thu, 13 Apr 2023 22:36:52 +0200 Subject: [PATCH 123/397] object_store: fix: Incorrect parsing of https Path Style S3 url (#4082) * fix: parse reagion from path-style urls, not bucket * fix: test * fix: parse s3 bucket from first path segment * test: add test for parsing bucket from path style url --- src/aws/mod.rs | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index de62360..34d468f 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -805,12 +805,16 @@ impl AmazonS3Builder { fn parse_url(&mut self, url: &str) -> Result<()> { let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - match parsed.scheme() { "s3" | "s3a" => self.bucket_name = Some(host.to_string()), "https" => match host.splitn(4, '.').collect_tuple() { - Some(("s3", bucket, "amazonaws", "com")) => { - self.bucket_name = Some(bucket.to_string()); + Some(("s3", region, "amazonaws", "com")) => { + self.region = Some(region.to_string()); + if let Some(bucket) = + parsed.path_segments().and_then(|mut path| path.next()) + { + self.bucket_name = Some(bucket.into()); + } } Some((bucket, "s3", region, "amazonaws.com")) => { self.bucket_name = Some(bucket.to_string()); @@ -1519,10 +1523,24 @@ mod tests { let mut builder = AmazonS3Builder::new(); builder - .parse_url("https://s3.bucket.amazonaws.com") + .parse_url("https://s3.region.amazonaws.com") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + + let mut builder = 
AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com/bucket") .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); assert_eq!(builder.bucket_name, Some("bucket".to_string())); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com/bucket.with.dot/path") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + assert_eq!(builder.bucket_name, Some("bucket.with.dot".to_string())); + let mut builder = AmazonS3Builder::new(); builder .parse_url("https://bucket.s3.region.amazonaws.com") From d4594fee87a73d793c4b597d9465f85fc04776bd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 21 Apr 2023 14:06:48 -0400 Subject: [PATCH 124/397] Fix object_store tests with latest aho_corasick (#4109) --- src/util.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.rs b/src/util.rs index 08bfd86..1ec63f2 100644 --- a/src/util.rs +++ b/src/util.rs @@ -229,7 +229,7 @@ mod tests { #[tokio::test] async fn test_coalesce_ranges() { let fetches = do_fetch(vec![], 0).await; - assert_eq!(fetches, vec![]); + assert!(fetches.is_empty()); let fetches = do_fetch(vec![0..3], 0).await; assert_eq!(fetches, vec![0..3]); From aedbb4fbed446924302b5651ced0a18b4a54b3f6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 Apr 2023 10:15:37 -0400 Subject: [PATCH 125/397] Fix flaky unknown_length_append (#4123) --- src/local.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/local.rs b/src/local.rs index d2553d4..286853d 100644 --- a/src/local.rs +++ b/src/local.rs @@ -1406,6 +1406,7 @@ mod not_wasm_tests { let mut writer = integration.append(&location).await.unwrap(); writer.write_all(data.as_ref()).await.unwrap(); + writer.flush().await.unwrap(); let read_data = integration .get(&location) From 8c2c3d07144fe331c4f6482ddfd14cce491a4e99 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Wed, 26 Apr 2023 06:11:39 +0800 Subject: [PATCH 126/397] Display the path in the open GCS credentials error (#4124) --- src/gcp/credential.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index a8dce71..057e013 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -29,14 +29,17 @@ use snafu::{ResultExt, Snafu}; use std::env; use std::fs::File; use std::io::BufReader; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::time::{Duration, Instant}; use tracing::info; #[derive(Debug, Snafu)] pub enum Error { - #[snafu(display("Unable to open service account file: {}", source))] - OpenCredentials { source: std::io::Error }, + #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] + OpenCredentials { + source: std::io::Error, + path: PathBuf, + }, #[snafu(display("Unable to decode service account file: {}", source))] DecodeCredentials { source: serde_json::Error }, @@ -233,7 +236,9 @@ fn read_credentials_file( where T: serde::de::DeserializeOwned, { - let file = File::open(service_account_path).context(OpenCredentialsSnafu)?; + let file = File::open(&service_account_path).context(OpenCredentialsSnafu { + path: service_account_path.as_ref().to_owned(), + })?; let reader = BufReader::new(file); serde_json::from_reader(reader).context(DecodeCredentialsSnafu) } From bb9d4f9e7346a8984dfd35d3a2358f69c88c07da Mon Sep 17 00:00:00 2001 From: kindly Date: Wed, 26 Apr 2023 12:07:08 +0100 
Subject: [PATCH 127/397] Retry when no or partial response from server. (#4120) Retry when server fails unexpectedly, or if there are network issues that are not handled by hyper. --- Cargo.toml | 3 ++- src/client/retry.rs | 42 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fcdbd98..b27482b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } +hyper = { version = "0.14", default-features = false, optional = true } quick-xml = { version = "0.28.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } @@ -66,7 +67,7 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut nix = "0.26.1" [features] -cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json","reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] diff --git a/src/client/retry.rs b/src/client/retry.rs index e6dd2eb..e6e92f0 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -24,6 +24,7 @@ use reqwest::header::LOCATION; use reqwest::{Response, StatusCode}; use std::time::{Duration, Instant}; use tracing::info; +use snafu::Error as SnafuError; /// Retry request error #[derive(Debug)] @@ -192,11 +193,29 @@ impl RetryExt for reqwest::RequestBuilder { }, Err(e) => { - return Err(Error{ - retries, - message: "request error".to_string(), - source: Some(e) - }) + let mut do_retry = false; + if let Some(source) = e.source() { + if let Some(e) = source.downcast_ref::() { + if e.is_connect() || e.is_closed() || e.is_incomplete_message() { + do_retry = true; + } + } + } + + if retries == max_retries + || now.elapsed() > retry_timeout + || !do_retry { + + return Err(Error{ + retries, + message: "request error".to_string(), + source: Some(e) + }) + } + let sleep = backoff.next(); + retries += 1; + info!("Encountered request error ({}) backing off for {} seconds, retry {} of {}", e, sleep.as_secs_f32(), retries, max_retries); + tokio::time::sleep(sleep).await; } } } @@ -345,6 +364,19 @@ mod tests { assert_eq!(e.retries, retry.max_retries); assert_eq!(e.message, "502 Bad Gateway"); + // Panic results in an incomplete message error in the client + mock.push_fn(|_| {panic!()}); + let r = do_request().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + + // Gives up after retrying mulitiple panics + for _ in 0..=retry.max_retries { + mock.push_fn(|_| {panic!()}); + } + let e = do_request().await.unwrap_err(); + assert_eq!(e.retries, retry.max_retries); + assert_eq!(e.message, "request error"); + // Shutdown mock.shutdown().await } From b907133a17fbd868a03e6146114b9aa11f51a614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berkay=20=C5=9Eahin?= <124376117+berkaysynnada@users.noreply.github.com> Date: Fri, 28 Apr 2023 21:15:09 +0300 Subject: [PATCH 128/397] InMemory append API (#4153) * ready to review * clippy fix * Refactor code to remove byte duplication * simplify shutdown Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: metesynnada 
<100111937+metesynnada@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/memory.rs | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/src/memory.rs b/src/memory.rs index 057a260..b01ffbb 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -117,6 +117,17 @@ impl ObjectStore for InMemory { Ok(()) } + async fn append( + &self, + location: &Path, + ) -> Result> { + Ok(Box::new(InMemoryAppend { + location: location.clone(), + data: Vec::::new(), + storage: StorageType::clone(&self.storage), + })) + } + async fn get(&self, location: &Path) -> Result { let data = self.entry(location).await?; @@ -329,8 +340,55 @@ impl AsyncWrite for InMemoryUpload { } } +struct InMemoryAppend { + location: Path, + data: Vec, + storage: StorageType, +} + +impl AsyncWrite for InMemoryAppend { + fn poll_write( + mut self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll> { + self.data.extend_from_slice(buf); + Poll::Ready(Ok(buf.len())) + } + + fn poll_flush( + mut self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let storage = StorageType::clone(&self.storage); + + let mut writer = storage.write(); + + if let Some((bytes, _)) = writer.remove(&self.location) { + let buf = std::mem::take(&mut self.data); + let concat = Bytes::from_iter(bytes.into_iter().chain(buf.into_iter())); + writer.insert(self.location.clone(), (concat, Utc::now())); + } else { + writer.insert( + self.location.clone(), + (Bytes::from(std::mem::take(&mut self.data)), Utc::now()), + ); + }; + Poll::Ready(Ok(())) + } + + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.poll_flush(cx) + } +} + #[cfg(test)] mod tests { + use tokio::io::AsyncWriteExt; + use super::*; use crate::{ @@ -396,4 +454,50 @@ mod tests { panic!("unexpected error type: {err:?}"); } } + + #[tokio::test] + async fn test_append_new() { + let in_memory = InMemory::new(); + let location = Path::from("some_file"); + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + let mut writer = in_memory.append(&location).await.unwrap(); + writer.write_all(&data).await.unwrap(); + writer.flush().await.unwrap(); + + let read_data = in_memory + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + async fn test_append_existing() { + let in_memory = InMemory::new(); + let location = Path::from("some_file"); + let data = Bytes::from("arbitrary"); + let data_appended = Bytes::from(" data"); + let expected_data = Bytes::from("arbitrary data"); + + let mut writer = in_memory.append(&location).await.unwrap(); + writer.write_all(&data).await.unwrap(); + writer.flush().await.unwrap(); + + writer.write_all(&data_appended).await.unwrap(); + writer.flush().await.unwrap(); + + let read_data = in_memory + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } } From 12ae83d4ff36b988b42f92623b2bd8d7eb9b2779 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 9 May 2023 17:43:23 +0100 Subject: [PATCH 129/397] Faster prefix match in object_store path handling (#4164) * Faster prefix match * Simplify parts --- src/path/mod.rs | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git 
a/src/path/mod.rs b/src/path/mod.rs index a15f7ca..29b1341 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -227,14 +227,9 @@ impl Path { /// Returns the [`PathPart`] of this [`Path`] pub fn parts(&self) -> impl Iterator> { - match self.raw.is_empty() { - true => itertools::Either::Left(std::iter::empty()), - false => itertools::Either::Right( - self.raw - .split(DELIMITER) - .map(|s| PathPart { raw: s.into() }), - ), - } + self.raw + .split_terminator(DELIMITER) + .map(|s| PathPart { raw: s.into() }) } /// Returns the last path segment containing the filename stored in this [`Path`] @@ -265,20 +260,14 @@ impl Path { &self, prefix: &Self, ) -> Option> + '_> { - let diff = itertools::diff_with(self.parts(), prefix.parts(), |a, b| a == b); - - match diff { - // Both were equal - None => Some(itertools::Either::Left(std::iter::empty())), - // Mismatch or prefix was longer => None - Some( - itertools::Diff::FirstMismatch(_, _, _) | itertools::Diff::Longer(_, _), - ) => None, - // Match with remaining - Some(itertools::Diff::Shorter(_, back)) => { - Some(itertools::Either::Right(back)) - } + let mut stripped = self.raw.strip_prefix(&prefix.raw)?; + if !stripped.is_empty() && !prefix.raw.is_empty() { + stripped = stripped.strip_prefix(DELIMITER)?; } + let iter = stripped + .split_terminator(DELIMITER) + .map(|x| PathPart { raw: x.into() }); + Some(iter) } /// Returns true if this [`Path`] starts with `prefix` @@ -453,6 +442,8 @@ mod tests { let prefix = existing_path.clone(); assert_eq!(existing_path.prefix_match(&prefix).unwrap().count(), 0); + + assert_eq!(Path::default().parts().count(), 0); } #[test] From deca169fa611bd02e526575117d61ecadc8446c4 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Wed, 10 May 2023 01:37:17 -0700 Subject: [PATCH 130/397] Object Store (AWS): Support dynamically resolving S3 bucket region (#4188) * feat(object_store): resolve aws region using bucket name * feat(object_store): resolve bucket region as floating fn * fix(object_store): clippy warnings * Cleanup error handling --------- Co-authored-by: Raphael Taylor-Davies --- src/aws/mod.rs | 73 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 34d468f..bc852ed 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -38,7 +38,7 @@ use futures::stream::BoxStream; use futures::TryStreamExt; use itertools::Itertools; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; use std::str::FromStr; @@ -144,6 +144,18 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("Bucket '{}' not found", bucket))] + BucketNotFound { bucket: String }, + + #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + ResolveRegion { + bucket: String, + source: reqwest::Error, + }, + + #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + RegionParse { bucket: String }, } impl From for super::Error { @@ -160,6 +172,38 @@ impl From for super::Error { } } +/// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. 
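A minimal sketch of how the region-resolution helper introduced by this patch might be wired into the existing builder (illustrative only; the from_env call, bucket name and error handling are assumptions, not part of the diff):

// Sketch: look up the bucket's region first, then pin the client to it.
// Assumes the `aws` feature of object_store; "bucket" is caller-supplied.
use object_store::aws::{resolve_bucket_region, AmazonS3, AmazonS3Builder};
use object_store::ClientOptions;

async fn s3_for_bucket(bucket: &str) -> object_store::Result<AmazonS3> {
    // resolve_bucket_region issues an unauthenticated HEAD request against the
    // bucket endpoint and reads the x-amz-bucket-region response header.
    let region = resolve_bucket_region(bucket, &ClientOptions::new()).await?;
    AmazonS3Builder::from_env()
        .with_bucket_name(bucket)
        .with_region(region)
        .build()
}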
+/// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html +pub async fn resolve_bucket_region( + bucket: &str, + client_options: &ClientOptions, +) -> Result { + use reqwest::StatusCode; + + let endpoint = format!("https://{}.s3.amazonaws.com", bucket); + + let client = client_options.client()?; + + let response = client + .head(&endpoint) + .send() + .await + .context(ResolveRegionSnafu { bucket })?; + + ensure!( + response.status() != StatusCode::NOT_FOUND, + BucketNotFoundSnafu { bucket } + ); + + let region = response + .headers() + .get("x-amz-bucket-region") + .and_then(|x| x.to_str().ok()) + .context(RegionParseSnafu { bucket })?; + + Ok(region.to_string()) +} + /// Interface for [Amazon S3](https://aws.amazon.com/s3/). #[derive(Debug)] pub struct AmazonS3 { @@ -1563,3 +1607,30 @@ mod tests { } } } + +#[cfg(test)] +mod s3_resolve_bucket_region_tests { + use super::*; + + #[tokio::test] + async fn test_private_bucket() { + let bucket = "bloxbender"; + + let region = resolve_bucket_region(bucket, &ClientOptions::new()) + .await + .unwrap(); + + let expected = "us-west-2".to_string(); + + assert_eq!(region, expected); + } + + #[tokio::test] + async fn test_bucket_does_not_exist() { + let bucket = "please-dont-exist"; + + let result = resolve_bucket_region(bucket, &ClientOptions::new()).await; + + assert!(result.is_err()); + } +} From 4d3339214b80041b673546d7c42312953bf7a1aa Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 10 May 2023 14:21:55 +0100 Subject: [PATCH 131/397] Fix ImdsManagedIdentityProvider (#4096) (#4193) --- src/azure/credential.rs | 21 +++++++++++++++------ src/azure/mod.rs | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 0196d93..8130df6 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -50,8 +50,17 @@ pub(crate) const RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; const CONTENT_TYPE_JSON: &str = "application/json"; const MSI_SECRET_ENV_KEY: &str = "IDENTITY_HEADER"; const MSI_API_VERSION: &str = "2019-08-01"; + +/// OIDC scope used when interacting with OAuth2 APIs +/// +/// const AZURE_STORAGE_SCOPE: &str = "https://storage.azure.com/.default"; +/// Resource ID used when obtaining an access token from the metadata endpoint +/// +/// +const AZURE_STORAGE_RESOURCE: &str = "https://storage.azure.com"; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Error performing token request: {}", source))] @@ -383,7 +392,7 @@ struct MsiTokenResponse { /// This authentication type works in Azure VMs, App Service and Azure Functions applications, as well as the Azure Cloud Shell /// #[derive(Debug)] -pub struct ImdsManagedIdentityOAuthProvider { +pub struct ImdsManagedIdentityProvider { msi_endpoint: String, client_id: Option, object_id: Option, @@ -391,8 +400,8 @@ pub struct ImdsManagedIdentityOAuthProvider { client: Client, } -impl ImdsManagedIdentityOAuthProvider { - /// Create a new [`ImdsManagedIdentityOAuthProvider`] for an azure backed store +impl ImdsManagedIdentityProvider { + /// Create a new [`ImdsManagedIdentityProvider`] for an azure backed store pub fn new( client_id: Option, object_id: Option, @@ -415,7 +424,7 @@ impl ImdsManagedIdentityOAuthProvider { } #[async_trait::async_trait] -impl TokenCredential for ImdsManagedIdentityOAuthProvider { +impl TokenCredential for ImdsManagedIdentityProvider { /// Fetch a token async fn fetch_token( &self, @@ -424,7 +433,7 @@ impl 
TokenCredential for ImdsManagedIdentityOAuthProvider { ) -> Result> { let mut query_items = vec![ ("api-version", MSI_API_VERSION), - ("resource", AZURE_STORAGE_SCOPE), + ("resource", AZURE_STORAGE_RESOURCE), ]; let mut identity = None; @@ -709,7 +718,7 @@ mod tests { )) }); - let credential = ImdsManagedIdentityOAuthProvider::new( + let credential = ImdsManagedIdentityProvider::new( Some("client_id".into()), None, None, diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 11350a2..ddfd028 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -1035,7 +1035,7 @@ impl MicrosoftAzureBuilder { } else { let client = self.client_options.clone().with_allow_http(true).client()?; - let msi_credential = credential::ImdsManagedIdentityOAuthProvider::new( + let msi_credential = credential::ImdsManagedIdentityProvider::new( self.client_id, self.object_id, self.msi_resource_id, From 292c46bc84b47dd95a739368604918b730f8c9c0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 10 May 2023 14:51:19 +0100 Subject: [PATCH 132/397] Simplify ObjectStore configuration pattern (#4189) --- src/aws/checksum.rs | 17 +++-- src/aws/mod.rs | 155 +++++++++++++++++--------------------------- src/azure/mod.rs | 107 +++++++++--------------------- src/client/retry.rs | 6 +- src/gcp/mod.rs | 112 ++++++++++---------------------- 5 files changed, 139 insertions(+), 258 deletions(-) diff --git a/src/aws/checksum.rs b/src/aws/checksum.rs index c787c28..57762b6 100644 --- a/src/aws/checksum.rs +++ b/src/aws/checksum.rs @@ -16,6 +16,7 @@ // under the License. use ring::digest::{self, digest as ring_digest}; +use std::str::FromStr; #[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -47,13 +48,21 @@ impl std::fmt::Display for Checksum { } } -impl TryFrom<&String> for Checksum { - type Error = (); +impl FromStr for Checksum { + type Err = (); - fn try_from(value: &String) -> Result { - match value.to_lowercase().as_str() { + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { "sha256" => Ok(Self::SHA256), _ => Err(()), } } } + +impl TryFrom<&String> for Checksum { + type Error = (); + + fn try_from(value: &String) -> Result { + value.parse() + } +} diff --git a/src/aws/mod.rs b/src/aws/mod.rs index bc852ed..5de177a 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -467,7 +467,7 @@ pub struct AmazonS3Builder { /// When set to true, unsigned payload option has to be used unsigned_payload: bool, /// Checksum algorithm which has to be used for object integrity check during upload - checksum_algorithm: Option, + checksum_algorithm: Option, /// Metadata endpoint, see metadata_endpoint: Option, /// Profile name, see @@ -478,30 +478,17 @@ pub struct AmazonS3Builder { /// Configuration keys for [`AmazonS3Builder`] /// -/// Configuration via keys can be dome via the [`try_with_option`](AmazonS3Builder::try_with_option) -/// or [`with_options`](AmazonS3Builder::try_with_options) methods on the builder. 
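The FromStr implementation added to Checksum above lets checksum configuration arrive as a plain string; a small sketch of what that enables (the helper function is illustrative, not part of this patch):

// Sketch: parse a user-supplied string into the typed checksum algorithm.
use object_store::aws::Checksum;

fn checksum_from_config(value: &str) -> Option<Checksum> {
    // Matching is case-insensitive, so "sha256" and "SHA256" both parse.
    value.parse::<Checksum>().ok()
}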
+/// Configuration via keys can be done via [`AmazonS3Builder::with_config`] /// /// # Example /// ``` -/// use std::collections::HashMap; -/// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; -/// -/// let options = HashMap::from([ -/// ("aws_access_key_id", "my-access-key-id"), -/// ("aws_secret_access_key", "my-secret-access-key"), -/// ]); -/// let typed_options = vec![ -/// (AmazonS3ConfigKey::DefaultRegion, "my-default-region"), -/// ]; -/// let aws = AmazonS3Builder::new() -/// .try_with_options(options) -/// .unwrap() -/// .try_with_options(typed_options) -/// .unwrap() -/// .try_with_option(AmazonS3ConfigKey::Region, "my-region") -/// .unwrap(); +/// # use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; +/// let builder = AmazonS3Builder::new() +/// .with_config("aws_access_key_id".parse().unwrap(), "my-access-key-id") +/// .with_config(AmazonS3ConfigKey::DefaultRegion, "my-default-region"); /// ``` #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] pub enum AmazonS3ConfigKey { /// AWS Access Key /// @@ -706,7 +693,7 @@ impl AmazonS3Builder { if let Ok(config_key) = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) { - builder = builder.try_with_option(config_key, value).unwrap(); + builder = builder.with_config(config_key, value); } } } @@ -754,14 +741,12 @@ impl AmazonS3Builder { } /// Set an option on the builder via a key - value pair. - /// - /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. - pub fn try_with_option( + pub fn with_config( mut self, - key: impl AsRef, + key: AmazonS3ConfigKey, value: impl Into, - ) -> Result { - match AmazonS3ConfigKey::from_str(key.as_ref())? { + ) -> Self { + match key { AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), AmazonS3ConfigKey::SecretAccessKey => { self.secret_access_key = Some(value.into()) @@ -786,18 +771,28 @@ impl AmazonS3Builder { AmazonS3ConfigKey::UnsignedPayload => { self.unsigned_payload = str_is_truthy(&value.into()) } - AmazonS3ConfigKey::Checksum => { - let algorithm = Checksum::try_from(&value.into()) - .map_err(|_| Error::InvalidChecksumAlgorithm)?; - self.checksum_algorithm = Some(algorithm) - } + AmazonS3ConfigKey::Checksum => self.checksum_algorithm = Some(value.into()), }; - Ok(self) + self + } + + /// Set an option on the builder via a key - value pair. + /// + /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. + #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs /// /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. 
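For callers that previously fed a map of string keys through the deprecated try_with_options, a minimal replacement sketch under the new API (the helper and its error handling are assumptions; it surfaces unknown keys to the caller instead of panicking):

// Sketch: apply untyped key/value pairs through the typed with_config API.
use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};

fn apply_options<'a>(
    pairs: impl IntoIterator<Item = (&'a str, &'a str)>,
) -> object_store::Result<AmazonS3Builder> {
    pairs
        .into_iter()
        .try_fold(AmazonS3Builder::new(), |builder, (key, value)| {
            // Unknown keys fail to parse and abort the fold with an error.
            Ok(builder.with_config(key.parse::<AmazonS3ConfigKey>()?, value))
        })
}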
+ #[deprecated(note = "Use with_config")] + #[allow(deprecated)] pub fn try_with_options< I: IntoIterator, impl Into)>, >( @@ -838,7 +833,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), AmazonS3ConfigKey::Profile => self.profile.clone(), AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), - AmazonS3ConfigKey::Checksum => self.checksum_algorithm.map(|v| v.to_string()), + AmazonS3ConfigKey::Checksum => self.checksum_algorithm.clone(), } } @@ -979,7 +974,8 @@ impl AmazonS3Builder { /// /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { - self.checksum_algorithm = Some(checksum_algorithm); + // Convert to String to enable deferred parsing of config + self.checksum_algorithm = Some(checksum_algorithm.to_string()); self } @@ -1032,6 +1028,11 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; + let checksum = self + .checksum_algorithm + .map(|c| c.parse()) + .transpose() + .map_err(|_| Error::InvalidChecksumAlgorithm)?; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { @@ -1129,7 +1130,7 @@ impl AmazonS3Builder { retry_config: self.retry_config, client_options: self.client_options, sign_payload: !self.unsigned_payload, - checksum: self.checksum_algorithm, + checksum, }; let client = Arc::new(S3Client::new(config)?); @@ -1303,7 +1304,10 @@ mod tests { assert_eq!(builder.token.unwrap(), aws_session_token); let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); - assert_eq!(builder.checksum_algorithm.unwrap(), Checksum::SHA256); + assert_eq!( + builder.checksum_algorithm.unwrap(), + Checksum::SHA256.to_string() + ); assert!(builder.unsigned_payload); } @@ -1324,46 +1328,22 @@ mod tests { ("aws_checksum_algorithm", "sha256".to_string()), ]); - let builder = AmazonS3Builder::new() - .try_with_options(&options) - .unwrap() - .try_with_option("aws_secret_access_key", "new-secret-key") - .unwrap(); - assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); - assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); - assert_eq!(builder.region.unwrap(), aws_default_region); - assert_eq!(builder.endpoint.unwrap(), aws_endpoint); - assert_eq!(builder.token.unwrap(), aws_session_token); - assert_eq!(builder.checksum_algorithm.unwrap(), Checksum::SHA256); - assert!(builder.unsigned_payload); - } - - #[test] - fn s3_test_config_from_typed_map() { - let aws_access_key_id = "object_store:fake_access_key_id".to_string(); - let aws_secret_access_key = "object_store:fake_secret_key".to_string(); - let aws_default_region = "object_store:fake_default_region".to_string(); - let aws_endpoint = "object_store:fake_endpoint".to_string(); - let aws_session_token = "object_store:fake_session_token".to_string(); - let options = HashMap::from([ - (AmazonS3ConfigKey::AccessKeyId, aws_access_key_id.clone()), - (AmazonS3ConfigKey::SecretAccessKey, aws_secret_access_key), - (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), - (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), - (AmazonS3ConfigKey::Token, aws_session_token.clone()), - (AmazonS3ConfigKey::UnsignedPayload, "true".to_string()), - ]); + let builder = options + 
.into_iter() + .fold(AmazonS3Builder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }) + .with_config(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key"); - let builder = AmazonS3Builder::new() - .try_with_options(&options) - .unwrap() - .try_with_option(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key") - .unwrap(); assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); assert_eq!(builder.region.unwrap(), aws_default_region); assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + assert_eq!( + builder.checksum_algorithm.unwrap(), + Checksum::SHA256.to_string() + ); assert!(builder.unsigned_payload); } @@ -1374,19 +1354,15 @@ mod tests { let aws_default_region = "object_store:fake_default_region".to_string(); let aws_endpoint = "object_store:fake_endpoint".to_string(); let aws_session_token = "object_store:fake_session_token".to_string(); - let options = HashMap::from([ - (AmazonS3ConfigKey::AccessKeyId, aws_access_key_id.clone()), - ( - AmazonS3ConfigKey::SecretAccessKey, - aws_secret_access_key.clone(), - ), - (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), - (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), - (AmazonS3ConfigKey::Token, aws_session_token.clone()), - (AmazonS3ConfigKey::UnsignedPayload, "true".to_string()), - ]); - let builder = AmazonS3Builder::new().try_with_options(&options).unwrap(); + let builder = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::AccessKeyId, &aws_access_key_id) + .with_config(AmazonS3ConfigKey::SecretAccessKey, &aws_secret_access_key) + .with_config(AmazonS3ConfigKey::DefaultRegion, &aws_default_region) + .with_config(AmazonS3ConfigKey::Endpoint, &aws_endpoint) + .with_config(AmazonS3ConfigKey::Token, &aws_session_token) + .with_config(AmazonS3ConfigKey::UnsignedPayload, "true"); + assert_eq!( builder .get_config_value(&AmazonS3ConfigKey::AccessKeyId) @@ -1423,19 +1399,6 @@ mod tests { ); } - #[test] - fn s3_test_config_fallible_options() { - let aws_access_key_id = "object_store:fake_access_key_id".to_string(); - let aws_secret_access_key = "object_store:fake_secret_key".to_string(); - let options = HashMap::from([ - ("aws_access_key_id", aws_access_key_id), - ("invalid-key", aws_secret_access_key), - ]); - - let builder = AmazonS3Builder::new().try_with_options(&options); - assert!(builder.is_err()); - } - #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); diff --git a/src/azure/mod.rs b/src/azure/mod.rs index ddfd028..15033dc 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -436,30 +436,17 @@ pub struct MicrosoftAzureBuilder { /// Configuration keys for [`MicrosoftAzureBuilder`] /// -/// Configuration via keys can be dome via the [`try_with_option`](MicrosoftAzureBuilder::try_with_option) -/// or [`with_options`](MicrosoftAzureBuilder::try_with_options) methods on the builder. 
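The same pattern applies to the Azure builder; a short sketch (the account name, key and container values below are placeholders, not part of this patch):

// Sketch: typed configuration of the Azure builder with the new with_config API.
use object_store::azure::{AzureConfigKey, MicrosoftAzureBuilder};

fn azure_builder() -> MicrosoftAzureBuilder {
    MicrosoftAzureBuilder::from_env()
        .with_config(AzureConfigKey::AccountName, "devstoreaccount1")
        .with_config(AzureConfigKey::AccessKey, "base64-account-key")
        .with_container_name("data")
}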
+/// Configuration via keys can be done via [`MicrosoftAzureBuilder::with_config`] /// /// # Example /// ``` -/// use std::collections::HashMap; -/// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; -/// -/// let options = HashMap::from([ -/// ("azure_client_id", "my-client-id"), -/// ("azure_client_secret", "my-account-name"), -/// ]); -/// let typed_options = vec![ -/// (AzureConfigKey::AccountName, "my-account-name"), -/// ]; -/// let azure = MicrosoftAzureBuilder::new() -/// .try_with_options(options) -/// .unwrap() -/// .try_with_options(typed_options) -/// .unwrap() -/// .try_with_option(AzureConfigKey::AuthorityId, "my-tenant-id") -/// .unwrap(); +/// # use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; +/// let builder = MicrosoftAzureBuilder::new() +/// .with_config("azure_client_id".parse().unwrap(), "my-client-id") +/// .with_config(AzureConfigKey::AuthorityId, "my-tenant-id"); /// ``` #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] +#[non_exhaustive] pub enum AzureConfigKey { /// The name of the azure storage account /// @@ -678,7 +665,7 @@ impl MicrosoftAzureBuilder { if let Ok(config_key) = AzureConfigKey::from_str(&key.to_ascii_lowercase()) { - builder = builder.try_with_option(config_key, value).unwrap(); + builder = builder.with_config(config_key, value); } } } @@ -724,12 +711,8 @@ impl MicrosoftAzureBuilder { } /// Set an option on the builder via a key - value pair. - pub fn try_with_option( - mut self, - key: impl AsRef, - value: impl Into, - ) -> Result { - match AzureConfigKey::from_str(key.as_ref())? { + pub fn with_config(mut self, key: AzureConfigKey, value: impl Into) -> Self { + match key { AzureConfigKey::AccessKey => self.access_key = Some(value.into()), AzureConfigKey::AccountName => self.account_name = Some(value.into()), AzureConfigKey::ClientId => self.client_id = Some(value.into()), @@ -750,10 +733,22 @@ impl MicrosoftAzureBuilder { self.use_emulator = str_is_truthy(&value.into()) } }; - Ok(self) + self + } + + /// Set an option on the builder via a key - value pair. 
+ #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] pub fn try_with_options< I: IntoIterator, impl Into)>, >( @@ -1270,31 +1265,11 @@ mod tests { ("azure_storage_token", azure_storage_token), ]); - let builder = MicrosoftAzureBuilder::new() - .try_with_options(options) - .unwrap(); - assert_eq!(builder.client_id.unwrap(), azure_client_id); - assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); - assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); - } - - #[test] - fn azure_test_config_from_typed_map() { - let azure_client_id = "object_store:fake_access_key_id".to_string(); - let azure_storage_account_name = "object_store:fake_secret_key".to_string(); - let azure_storage_token = "object_store:fake_default_region".to_string(); - let options = HashMap::from([ - (AzureConfigKey::ClientId, azure_client_id.clone()), - ( - AzureConfigKey::AccountName, - azure_storage_account_name.clone(), - ), - (AzureConfigKey::Token, azure_storage_token.clone()), - ]); - - let builder = MicrosoftAzureBuilder::new() - .try_with_options(&options) - .unwrap(); + let builder = options + .into_iter() + .fold(MicrosoftAzureBuilder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); assert_eq!(builder.client_id.unwrap(), azure_client_id); assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); @@ -1305,18 +1280,11 @@ mod tests { let azure_client_id = "object_store:fake_access_key_id".to_string(); let azure_storage_account_name = "object_store:fake_secret_key".to_string(); let azure_storage_token = "object_store:fake_default_region".to_string(); - let options = HashMap::from([ - (AzureConfigKey::ClientId, azure_client_id.clone()), - ( - AzureConfigKey::AccountName, - azure_storage_account_name.clone(), - ), - (AzureConfigKey::Token, azure_storage_token.clone()), - ]); - let builder = MicrosoftAzureBuilder::new() - .try_with_options(&options) - .unwrap(); + .with_config(AzureConfigKey::ClientId, &azure_client_id) + .with_config(AzureConfigKey::AccountName, &azure_storage_account_name) + .with_config(AzureConfigKey::Token, &azure_storage_token); + assert_eq!( builder.get_config_value(&AzureConfigKey::ClientId).unwrap(), azure_client_id @@ -1333,19 +1301,6 @@ mod tests { ); } - #[test] - fn azure_test_config_fallible_options() { - let azure_client_id = "object_store:fake_access_key_id".to_string(); - let azure_storage_token = "object_store:fake_default_region".to_string(); - let options = HashMap::from([ - ("azure_client_id", azure_client_id), - ("invalid-key", azure_storage_token), - ]); - - let builder = MicrosoftAzureBuilder::new().try_with_options(&options); - assert!(builder.is_err()); - } - #[test] fn azure_test_split_sas() { let raw_sas = "?sv=2021-10-04&st=2023-01-04T17%3A48%3A57Z&se=2023-01-04T18%3A15%3A00Z&sr=c&sp=rcwl&sig=C7%2BZeEOWbrxPA3R0Cw%2Fw1EZz0%2B4KBvQexeKZKe%2BB6h0%3D"; diff --git a/src/client/retry.rs b/src/client/retry.rs index e6e92f0..f9c2dd3 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -22,9 +22,9 @@ use futures::future::BoxFuture; use futures::FutureExt; use reqwest::header::LOCATION; use reqwest::{Response, StatusCode}; +use snafu::Error as SnafuError; use std::time::{Duration, Instant}; use 
tracing::info; -use snafu::Error as SnafuError; /// Retry request error #[derive(Debug)] @@ -365,13 +365,13 @@ mod tests { assert_eq!(e.message, "502 Bad Gateway"); // Panic results in an incomplete message error in the client - mock.push_fn(|_| {panic!()}); + mock.push_fn(|_| panic!()); let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::OK); // Gives up after retrying mulitiple panics for _ in 0..=retry.max_retries { - mock.push_fn(|_| {panic!()}); + mock.push_fn(|_| panic!()); } let e = do_request().await.unwrap_err(); assert_eq!(e.retries, retry.max_retries); diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index a6cf660..6f3d53d 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -786,29 +786,17 @@ pub struct GoogleCloudStorageBuilder { /// Configuration keys for [`GoogleCloudStorageBuilder`] /// -/// Configuration via keys can be done via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option) -/// or [`try_with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder. +/// Configuration via keys can be done via [`GoogleCloudStorageBuilder::with_config`] /// /// # Example /// ``` -/// use std::collections::HashMap; -/// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; -/// -/// let options = HashMap::from([ -/// ("google_service_account", "my-service-account"), -/// ]); -/// let typed_options = vec![ -/// (GoogleConfigKey::Bucket, "my-bucket"), -/// ]; -/// let azure = GoogleCloudStorageBuilder::new() -/// .try_with_options(options) -/// .unwrap() -/// .try_with_options(typed_options) -/// .unwrap() -/// .try_with_option(GoogleConfigKey::Bucket, "my-new-bucket") -/// .unwrap(); +/// # use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; +/// let builder = GoogleCloudStorageBuilder::new() +/// .with_config("google_service_account".parse().unwrap(), "my-service-account") +/// .with_config(GoogleConfigKey::Bucket, "my-bucket"); /// ``` #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] pub enum GoogleConfigKey { /// Path to the service account file /// @@ -926,7 +914,7 @@ impl GoogleCloudStorageBuilder { if let Ok(config_key) = GoogleConfigKey::from_str(&key.to_ascii_lowercase()) { - builder = builder.try_with_option(config_key, value).unwrap(); + builder = builder.with_config(config_key, value); } } } @@ -957,12 +945,8 @@ impl GoogleCloudStorageBuilder { } /// Set an option on the builder via a key - value pair. - pub fn try_with_option( - mut self, - key: impl AsRef, - value: impl Into, - ) -> Result { - match GoogleConfigKey::from_str(key.as_ref())? { + pub fn with_config(mut self, key: GoogleConfigKey, value: impl Into) -> Self { + match key { GoogleConfigKey::ServiceAccount => { self.service_account_path = Some(value.into()) } @@ -974,10 +958,22 @@ impl GoogleCloudStorageBuilder { self.application_credentials_path = Some(value.into()) } }; - Ok(self) + self + } + + /// Set an option on the builder via a key - value pair. 
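Because each key round-trips through FromStr, the shorter aliases keep working with the new API; a small sketch (assertions only, assuming the alias set exercised by the tests below):

// Sketch: a long-form key and its alias parse to the same typed key.
use object_store::gcp::GoogleConfigKey;

fn bucket_aliases_agree() {
    let a: GoogleConfigKey = "google_bucket_name".parse().unwrap();
    let b: GoogleConfigKey = "bucket".parse().unwrap();
    assert_eq!(a, GoogleConfigKey::Bucket);
    assert_eq!(a, b);
}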
+ #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] pub fn try_with_options< I: IntoIterator, impl Into)>, >( @@ -1449,31 +1445,12 @@ mod test { ("google_bucket_name", google_bucket_name.clone()), ]); - let builder = GoogleCloudStorageBuilder::new() - .try_with_options(&options) - .unwrap(); - assert_eq!( - builder.service_account_path.unwrap(), - google_service_account.as_str() - ); - assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); - } + let builder = options + .iter() + .fold(GoogleCloudStorageBuilder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); - #[test] - fn gcs_test_config_from_typed_map() { - let google_service_account = "object_store:fake_service_account".to_string(); - let google_bucket_name = "object_store:fake_bucket".to_string(); - let options = HashMap::from([ - ( - GoogleConfigKey::ServiceAccount, - google_service_account.clone(), - ), - (GoogleConfigKey::Bucket, google_bucket_name.clone()), - ]); - - let builder = GoogleCloudStorageBuilder::new() - .try_with_options(&options) - .unwrap(); assert_eq!( builder.service_account_path.unwrap(), google_service_account.as_str() @@ -1485,17 +1462,10 @@ mod test { fn gcs_test_config_get_value() { let google_service_account = "object_store:fake_service_account".to_string(); let google_bucket_name = "object_store:fake_bucket".to_string(); - let options = HashMap::from([ - ( - GoogleConfigKey::ServiceAccount, - google_service_account.clone(), - ), - (GoogleConfigKey::Bucket, google_bucket_name.clone()), - ]); - let builder = GoogleCloudStorageBuilder::new() - .try_with_options(&options) - .unwrap(); + .with_config(GoogleConfigKey::ServiceAccount, &google_service_account) + .with_config(GoogleConfigKey::Bucket, &google_bucket_name); + assert_eq!( builder .get_config_value(&GoogleConfigKey::ServiceAccount) @@ -1508,19 +1478,6 @@ mod test { ); } - #[test] - fn gcs_test_config_fallible_options() { - let google_service_account = "object_store:fake_service_account".to_string(); - let google_bucket_name = "object_store:fake_bucket".to_string(); - let options = HashMap::from([ - ("google_service_account", google_service_account), - ("invalid-key", google_bucket_name), - ]); - - let builder = GoogleCloudStorageBuilder::new().try_with_options(&options); - assert!(builder.is_err()); - } - #[test] fn gcs_test_config_aliases() { // Service account path @@ -1531,16 +1488,14 @@ mod test { "service_account_path", ] { let builder = GoogleCloudStorageBuilder::new() - .try_with_options([(alias, "/fake/path.json")]) - .unwrap(); + .with_config(alias.parse().unwrap(), "/fake/path.json"); assert_eq!("/fake/path.json", builder.service_account_path.unwrap()); } // Service account key for alias in ["google_service_account_key", "service_account_key"] { let builder = GoogleCloudStorageBuilder::new() - .try_with_options([(alias, FAKE_KEY)]) - .unwrap(); + .with_config(alias.parse().unwrap(), FAKE_KEY); assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); } @@ -1552,8 +1507,7 @@ mod test { "bucket_name", ] { let builder = GoogleCloudStorageBuilder::new() - .try_with_options([(alias, "fake_bucket")]) - .unwrap(); + .with_config(alias.parse().unwrap(), "fake_bucket"); assert_eq!("fake_bucket", builder.bucket_name.unwrap()); } } From 
d6baf04cc36b2ee6ba00b827819b5389b76ae66e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 10 May 2023 18:43:13 +0100 Subject: [PATCH 133/397] Recognise R2 URLs (#4190) (#4194) --- src/aws/mod.rs | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 5de177a..6ea24fb 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -724,6 +724,7 @@ impl AmazonS3Builder { /// - `s3a:///` /// - `https://s3..amazonaws.com` /// - `https://.s3..amazonaws.com` + /// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket` /// /// Note: Settings derived from the URL will override any others set on this builder /// @@ -849,9 +850,8 @@ impl AmazonS3Builder { "https" => match host.splitn(4, '.').collect_tuple() { Some(("s3", region, "amazonaws", "com")) => { self.region = Some(region.to_string()); - if let Some(bucket) = - parsed.path_segments().and_then(|mut path| path.next()) - { + let bucket = parsed.path_segments().into_iter().flatten().next(); + if let Some(bucket) = bucket { self.bucket_name = Some(bucket.into()); } } @@ -860,6 +860,16 @@ impl AmazonS3Builder { self.region = Some(region.to_string()); self.virtual_hosted_style_request = true; } + Some((account, "r2", "cloudflarestorage", "com")) => { + self.region = Some("auto".to_string()); + let endpoint = format!("https://{account}.r2.cloudflarestorage.com"); + self.endpoint = Some(endpoint); + + let bucket = parsed.path_segments().into_iter().flatten().next(); + if let Some(bucket) = bucket { + self.bucket_name = Some(bucket.into()); + } + } _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), }, scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), @@ -1556,6 +1566,18 @@ mod tests { assert_eq!(builder.region, Some("region".to_string())); assert!(builder.virtual_hosted_style_request); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://account123.r2.cloudflarestorage.com/bucket-123") + .unwrap(); + + assert_eq!(builder.bucket_name, Some("bucket-123".to_string())); + assert_eq!(builder.region, Some("auto".to_string())); + assert_eq!( + builder.endpoint, + Some("https://account123.r2.cloudflarestorage.com".to_string()) + ); + let err_cases = [ "mailto://bucket/path", "https://s3.bucket.mydomain.com", From d76e3065bd3ac65541590a2ffd513e5f57e90c44 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 10 May 2023 18:44:30 +0100 Subject: [PATCH 134/397] Deffered config parsing (#4191) (#4192) --- src/aws/checksum.rs | 10 +++++ src/aws/mod.rs | 100 +++++++++++++++++++++++++++----------------- src/azure/mod.rs | 29 +++++++------ src/client/mod.rs | 31 ++++++++++++-- src/config.rs | 81 +++++++++++++++++++++++++++++++++++ src/lib.rs | 3 ++ src/util.rs | 9 ---- 7 files changed, 198 insertions(+), 65 deletions(-) create mode 100644 src/config.rs diff --git a/src/aws/checksum.rs b/src/aws/checksum.rs index 57762b6..a50bd2d 100644 --- a/src/aws/checksum.rs +++ b/src/aws/checksum.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
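The effect of the deferred parsing introduced by this patch can be sketched as follows (illustrative; it mirrors the new test_invalid_config test further down, with placeholder bucket and region values):

// Sketch: invalid option values are accepted by the builder and only
// rejected when build() runs, so setters no longer need to be fallible.
use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};

fn checksum_error_surfaces_at_build() {
    let err = AmazonS3Builder::new()
        .with_config(AmazonS3ConfigKey::Checksum, "md5") // not a supported algorithm
        .with_bucket_name("bucket")
        .with_region("region")
        .build()
        .unwrap_err();
    assert!(err.to_string().contains("not a valid checksum algorithm"));
}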
+use crate::config::Parse; use ring::digest::{self, digest as ring_digest}; use std::str::FromStr; @@ -66,3 +67,12 @@ impl TryFrom<&String> for Checksum { value.parse() } } + +impl Parse for Checksum { + fn parse(v: &str) -> crate::Result { + v.parse().map_err(|_| crate::Error::Generic { + store: "Config", + source: format!("\"{v}\" is not a valid checksum algorithm").into(), + }) + } +} diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 6ea24fb..fe49471 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -53,8 +53,9 @@ use crate::aws::credential::{ AwsCredential, CredentialProvider, InstanceCredentialProvider, StaticCredentialProvider, WebIdentityProvider, }; +use crate::client::ClientConfigKey; +use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; -use crate::util::str_is_truthy; use crate::{ ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, RetryConfig, StreamExt, @@ -103,9 +104,6 @@ enum Error { source: std::num::ParseIntError, }, - #[snafu(display("Invalid Checksum algorithm"))] - InvalidChecksumAlgorithm, - #[snafu(display("Missing region"))] MissingRegion, @@ -461,13 +459,13 @@ pub struct AmazonS3Builder { /// Retry config retry_config: RetryConfig, /// When set to true, fallback to IMDSv1 - imdsv1_fallback: bool, + imdsv1_fallback: ConfigValue, /// When set to true, virtual hosted style request has to be used - virtual_hosted_style_request: bool, + virtual_hosted_style_request: ConfigValue, /// When set to true, unsigned payload option has to be used - unsigned_payload: bool, + unsigned_payload: ConfigValue, /// Checksum algorithm which has to be used for object integrity check during upload - checksum_algorithm: Option, + checksum_algorithm: Option>, /// Metadata endpoint, see metadata_endpoint: Option, /// Profile name, see @@ -709,8 +707,9 @@ impl AmazonS3Builder { } if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { - builder.client_options = - builder.client_options.with_allow_http(str_is_truthy(&text)); + builder.client_options = builder + .client_options + .with_config(ClientConfigKey::AllowHttp, text); } builder @@ -756,11 +755,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), AmazonS3ConfigKey::Token => self.token = Some(value.into()), - AmazonS3ConfigKey::ImdsV1Fallback => { - self.imdsv1_fallback = str_is_truthy(&value.into()) - } + AmazonS3ConfigKey::ImdsV1Fallback => self.imdsv1_fallback.parse(value), AmazonS3ConfigKey::VirtualHostedStyleRequest => { - self.virtual_hosted_style_request = str_is_truthy(&value.into()) + self.virtual_hosted_style_request.parse(value) } AmazonS3ConfigKey::DefaultRegion => { self.region = self.region.or_else(|| Some(value.into())) @@ -769,10 +766,10 @@ impl AmazonS3Builder { self.metadata_endpoint = Some(value.into()) } AmazonS3ConfigKey::Profile => self.profile = Some(value.into()), - AmazonS3ConfigKey::UnsignedPayload => { - self.unsigned_payload = str_is_truthy(&value.into()) + AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), + AmazonS3ConfigKey::Checksum => { + self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) } - AmazonS3ConfigKey::Checksum => self.checksum_algorithm = Some(value.into()), }; self } @@ -834,7 +831,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), AmazonS3ConfigKey::Profile => self.profile.clone(), 
AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), - AmazonS3ConfigKey::Checksum => self.checksum_algorithm.clone(), + AmazonS3ConfigKey::Checksum => { + self.checksum_algorithm.as_ref().map(ToString::to_string) + } } } @@ -858,7 +857,7 @@ impl AmazonS3Builder { Some((bucket, "s3", region, "amazonaws.com")) => { self.bucket_name = Some(bucket.to_string()); self.region = Some(region.to_string()); - self.virtual_hosted_style_request = true; + self.virtual_hosted_style_request = true.into(); } Some((account, "r2", "cloudflarestorage", "com")) => { self.region = Some("auto".to_string()); @@ -944,7 +943,7 @@ impl AmazonS3Builder { mut self, virtual_hosted_style_request: bool, ) -> Self { - self.virtual_hosted_style_request = virtual_hosted_style_request; + self.virtual_hosted_style_request = virtual_hosted_style_request.into(); self } @@ -967,7 +966,7 @@ impl AmazonS3Builder { /// [SSRF attack]: https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/ /// pub fn with_imdsv1_fallback(mut self) -> Self { - self.imdsv1_fallback = true; + self.imdsv1_fallback = true.into(); self } @@ -976,7 +975,7 @@ impl AmazonS3Builder { /// * false (default): Signed payload option is used, where the checksum for the request body is computed and included when constructing a canonical request. /// * true: Unsigned payload option is used. `UNSIGNED-PAYLOAD` literal is included when constructing a canonical request, pub fn with_unsigned_payload(mut self, unsigned_payload: bool) -> Self { - self.unsigned_payload = unsigned_payload; + self.unsigned_payload = unsigned_payload.into(); self } @@ -985,7 +984,7 @@ impl AmazonS3Builder { /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { // Convert to String to enable deferred parsing of config - self.checksum_algorithm = Some(checksum_algorithm.to_string()); + self.checksum_algorithm = Some(checksum_algorithm.into()); self } @@ -1038,11 +1037,7 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; - let checksum = self - .checksum_algorithm - .map(|c| c.parse()) - .transpose() - .map_err(|_| Error::InvalidChecksumAlgorithm)?; + let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { @@ -1103,7 +1098,7 @@ impl AmazonS3Builder { cache: Default::default(), client: client_options.client()?, retry_config: self.retry_config.clone(), - imdsv1_fallback: self.imdsv1_fallback, + imdsv1_fallback: self.imdsv1_fallback.get()?, metadata_endpoint: self .metadata_endpoint .unwrap_or_else(|| METADATA_ENDPOINT.into()), @@ -1119,7 +1114,7 @@ impl AmazonS3Builder { // If `endpoint` is provided then its assumed to be consistent with // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then // `endpoint` should have bucket name included. - if self.virtual_hosted_style_request { + if self.virtual_hosted_style_request.get()? 
{ endpoint = self .endpoint .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); @@ -1139,7 +1134,7 @@ impl AmazonS3Builder { credentials, retry_config: self.retry_config, client_options: self.client_options, - sign_payload: !self.unsigned_payload, + sign_payload: !self.unsigned_payload.get()?, checksum, }; @@ -1315,10 +1310,10 @@ mod tests { let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); assert_eq!( - builder.checksum_algorithm.unwrap(), - Checksum::SHA256.to_string() + builder.checksum_algorithm.unwrap().get().unwrap(), + Checksum::SHA256 ); - assert!(builder.unsigned_payload); + assert!(builder.unsigned_payload.get().unwrap()); } #[test] @@ -1351,10 +1346,10 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); assert_eq!( - builder.checksum_algorithm.unwrap(), - Checksum::SHA256.to_string() + builder.checksum_algorithm.unwrap().get().unwrap(), + Checksum::SHA256 ); - assert!(builder.unsigned_payload); + assert!(builder.unsigned_payload.get().unwrap()); } #[test] @@ -1564,7 +1559,7 @@ mod tests { .unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); assert_eq!(builder.region, Some("region".to_string())); - assert!(builder.virtual_hosted_style_request); + assert!(builder.virtual_hosted_style_request.get().unwrap()); let mut builder = AmazonS3Builder::new(); builder @@ -1591,6 +1586,35 @@ mod tests { builder.parse_url(case).unwrap_err(); } } + + #[test] + fn test_invalid_config() { + let err = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::ImdsV1Fallback, "enabled") + .with_bucket_name("bucket") + .with_region("region") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Generic Config error: failed to parse \"enabled\" as boolean" + ); + + let err = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::Checksum, "md5") + .with_bucket_name("bucket") + .with_region("region") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Generic Config error: \"md5\" is not a valid checksum algorithm" + ); + } } #[cfg(test)] diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 15033dc..2b5b43a 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -51,7 +51,9 @@ use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; -use crate::util::{str_is_truthy, RFC1123_FMT}; +use crate::client::ClientConfigKey; +use crate::config::ConfigValue; +use crate::util::RFC1123_FMT; pub use credential::authority_hosts; mod client; @@ -417,7 +419,7 @@ pub struct MicrosoftAzureBuilder { /// Url url: Option, /// When set to true, azurite storage emulator has to be used - use_emulator: bool, + use_emulator: ConfigValue, /// Msi endpoint for acquiring managed identity token msi_endpoint: Option, /// Object id for use with managed identity authentication @@ -427,7 +429,7 @@ pub struct MicrosoftAzureBuilder { /// File containing token for Azure AD workload identity federation federated_token_file: Option, /// When set to true, azure cli has to be used for acquiring access token - use_azure_cli: bool, + use_azure_cli: ConfigValue, /// Retry config retry_config: RetryConfig, /// Client options @@ -672,8 +674,9 @@ impl MicrosoftAzureBuilder { } if let Ok(text) = std::env::var("AZURE_ALLOW_HTTP") { - builder.client_options = - builder.client_options.with_allow_http(str_is_truthy(&text)); + builder.client_options = builder + .client_options + 
.with_config(ClientConfigKey::AllowHttp, text) } if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { @@ -726,12 +729,8 @@ impl MicrosoftAzureBuilder { AzureConfigKey::FederatedTokenFile => { self.federated_token_file = Some(value.into()) } - AzureConfigKey::UseAzureCli => { - self.use_azure_cli = str_is_truthy(&value.into()) - } - AzureConfigKey::UseEmulator => { - self.use_emulator = str_is_truthy(&value.into()) - } + AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), + AzureConfigKey::UseEmulator => self.use_emulator.parse(value), }; self } @@ -898,7 +897,7 @@ impl MicrosoftAzureBuilder { /// Set if the Azure emulator should be used (defaults to false) pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { - self.use_emulator = use_emulator; + self.use_emulator = use_emulator.into(); self } @@ -956,7 +955,7 @@ impl MicrosoftAzureBuilder { /// Set if the Azure Cli should be used for acquiring access token /// pub fn with_use_azure_cli(mut self, use_azure_cli: bool) -> Self { - self.use_azure_cli = use_azure_cli; + self.use_azure_cli = use_azure_cli.into(); self } @@ -969,7 +968,7 @@ impl MicrosoftAzureBuilder { let container = self.container_name.ok_or(Error::MissingContainerName {})?; - let (is_emulator, storage_url, auth, account) = if self.use_emulator { + let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? { let account_name = self .account_name .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); @@ -1022,7 +1021,7 @@ impl MicrosoftAzureBuilder { credential::CredentialProvider::SASToken(query_pairs) } else if let Some(sas) = self.sas_key { credential::CredentialProvider::SASToken(split_sas(&sas)?) - } else if self.use_azure_cli { + } else if self.use_azure_cli.get()? { credential::CredentialProvider::TokenCredential( TokenCache::default(), Box::new(credential::AzureCliCredential::new()), diff --git a/src/client/mod.rs b/src/client/mod.rs index d019e81..d7b0b86 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -26,8 +26,10 @@ pub mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; +use crate::config::ConfigValue; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Duration; @@ -43,6 +45,14 @@ fn map_client_error(e: reqwest::Error) -> super::Error { static DEFAULT_USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); +/// Configuration keys for [`ClientOptions`] +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] +#[non_exhaustive] +pub enum ClientConfigKey { + /// Allow non-TLS, i.e. 
non-HTTPS connections + AllowHttp, +} + /// HTTP client configuration for remote object stores #[derive(Debug, Clone, Default)] pub struct ClientOptions { @@ -51,7 +61,7 @@ pub struct ClientOptions { default_content_type: Option, default_headers: Option, proxy_url: Option, - allow_http: bool, + allow_http: ConfigValue, allow_insecure: bool, timeout: Option, connect_timeout: Option, @@ -70,6 +80,21 @@ impl ClientOptions { Default::default() } + /// Set an option by key + pub fn with_config(mut self, key: ClientConfigKey, value: impl Into) -> Self { + match key { + ClientConfigKey::AllowHttp => self.allow_http.parse(value), + } + self + } + + /// Get an option by key + pub fn get_config_value(&self, key: &ClientConfigKey) -> Option { + match key { + ClientConfigKey::AllowHttp => Some(self.allow_http.to_string()), + } + } + /// Sets the User-Agent header to be used by this client /// /// Default is based on the version of this crate @@ -104,7 +129,7 @@ impl ClientOptions { /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.allow_http = allow_http; + self.allow_http = allow_http.into(); self } /// Allows connections to invalid SSL certificates @@ -280,7 +305,7 @@ impl ClientOptions { } builder - .https_only(!self.allow_http) + .https_only(!self.allow_http.get()?) .build() .map_err(map_client_error) } diff --git a/src/config.rs b/src/config.rs new file mode 100644 index 0000000..3ecce2e --- /dev/null +++ b/src/config.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::{Error, Result}; +use std::fmt::{Debug, Display, Formatter}; + +/// Provides deferred parsing of a value +/// +/// This allows builders to defer fallibility to build +#[derive(Debug, Clone)] +pub enum ConfigValue { + Parsed(T), + Deferred(String), +} + +impl Display for ConfigValue { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Parsed(v) => write!(f, "{v}"), + Self::Deferred(v) => write!(f, "{v}"), + } + } +} + +impl From for ConfigValue { + fn from(value: T) -> Self { + Self::Parsed(value) + } +} + +impl ConfigValue { + pub fn parse(&mut self, v: impl Into) { + *self = Self::Deferred(v.into()) + } + + pub fn get(&self) -> Result { + match self { + Self::Parsed(v) => Ok(v.clone()), + Self::Deferred(v) => T::parse(v), + } + } +} + +impl Default for ConfigValue { + fn default() -> Self { + Self::Parsed(T::default()) + } +} + +/// A value that can be stored in [`ConfigValue`] +pub trait Parse: Sized { + fn parse(v: &str) -> Result; +} + +impl Parse for bool { + fn parse(v: &str) -> Result { + let lower = v.to_ascii_lowercase(); + match lower.as_str() { + "1" | "true" | "on" | "yes" | "y" => Ok(true), + "0" | "false" | "off" | "no" | "n" => Ok(false), + _ => Err(Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as boolean").into(), + }), + } + } +} diff --git a/src/lib.rs b/src/lib.rs index c31027c..1390a01 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -247,6 +247,9 @@ mod client; #[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] pub use client::{backoff::BackoffConfig, retry::RetryConfig}; +#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] +mod config; + #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] mod multipart; mod util; diff --git a/src/util.rs b/src/util.rs index 1ec63f2..e5c701d 100644 --- a/src/util.rs +++ b/src/util.rs @@ -185,15 +185,6 @@ fn merge_ranges( ret } -#[allow(dead_code)] -pub(crate) fn str_is_truthy(val: &str) -> bool { - val.eq_ignore_ascii_case("1") - | val.eq_ignore_ascii_case("true") - | val.eq_ignore_ascii_case("on") - | val.eq_ignore_ascii_case("yes") - | val.eq_ignore_ascii_case("y") -} - #[cfg(test)] mod tests { use super::*; From 467a7797945e5a224aeb00993478d0e2fc4b269b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 May 2023 13:19:45 +0100 Subject: [PATCH 135/397] Allow setting ClientOptions with Options API (#4202) * Allow setting ClientOptions with options API * More clippy --- src/aws/mod.rs | 25 +++++++++++++++---------- src/azure/mod.rs | 25 +++++++++++++++---------- src/client/mod.rs | 23 +++++++++++++++++++++++ src/gcp/mod.rs | 18 ++++++++++++++---- 4 files changed, 67 insertions(+), 24 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index fe49471..17d779f 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -604,6 +604,9 @@ pub enum AmazonS3ConfigKey { /// - `aws_profile` /// - `profile` Profile, + + /// Client options + Client(ClientConfigKey), } impl AsRef for AmazonS3ConfigKey { @@ -622,6 +625,7 @@ impl AsRef for AmazonS3ConfigKey { Self::Profile => "aws_profile", Self::UnsignedPayload => "aws_unsigned_payload", Self::Checksum => "aws_checksum_algorithm", + Self::Client(opt) => opt.as_ref(), } } } @@ -652,7 +656,12 @@ impl FromStr for AmazonS3ConfigKey { "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), 
"aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), - _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + // Backwards compatibility + "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, } } } @@ -688,9 +697,7 @@ impl AmazonS3Builder { for (os_key, os_value) in std::env::vars_os() { if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { if key.starts_with("AWS_") { - if let Ok(config_key) = - AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) - { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { builder = builder.with_config(config_key, value); } } @@ -706,12 +713,6 @@ impl AmazonS3Builder { Some(format!("{METADATA_ENDPOINT}{metadata_relative_uri}")); } - if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { - builder.client_options = builder - .client_options - .with_config(ClientConfigKey::AllowHttp, text); - } - builder } @@ -770,6 +771,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Checksum => { self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } }; self } @@ -834,6 +838,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Checksum => { self.checksum_algorithm.as_ref().map(ToString::to_string) } + AmazonS3ConfigKey::Client(key) => self.client_options.get_config_value(key), } } diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 2b5b43a..c2cfdfe 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -559,6 +559,9 @@ pub enum AzureConfigKey { /// - `azure_use_azure_cli` /// - `use_azure_cli` UseAzureCli, + + /// Client options + Client(ClientConfigKey), } impl AsRef for AzureConfigKey { @@ -577,6 +580,7 @@ impl AsRef for AzureConfigKey { Self::MsiResourceId => "azure_msi_resource_id", Self::FederatedTokenFile => "azure_federated_token_file", Self::UseAzureCli => "azure_use_azure_cli", + Self::Client(key) => key.as_ref(), } } } @@ -621,7 +625,12 @@ impl FromStr for AzureConfigKey { Ok(Self::FederatedTokenFile) } "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), - _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + // Backwards compatibility + "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, } } } @@ -664,21 +673,13 @@ impl MicrosoftAzureBuilder { for (os_key, os_value) in std::env::vars_os() { if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { if key.starts_with("AZURE_") { - if let Ok(config_key) = - AzureConfigKey::from_str(&key.to_ascii_lowercase()) - { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { builder = builder.with_config(config_key, value); } } } } - if let Ok(text) = std::env::var("AZURE_ALLOW_HTTP") { - builder.client_options = builder - .client_options - .with_config(ClientConfigKey::AllowHttp, text) - } - if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { builder = builder.with_msi_endpoint(text); } @@ -731,6 +732,9 @@ impl MicrosoftAzureBuilder { } AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), AzureConfigKey::UseEmulator => self.use_emulator.parse(value), + AzureConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } }; self } @@ -786,6 +790,7 
@@ impl MicrosoftAzureBuilder { AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), + AzureConfigKey::Client(key) => self.client_options.get_config_value(key), } } diff --git a/src/client/mod.rs b/src/client/mod.rs index d7b0b86..d2242dd 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -31,6 +31,7 @@ use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::str::FromStr; use std::time::Duration; use crate::path::Path; @@ -53,6 +54,28 @@ pub enum ClientConfigKey { AllowHttp, } +impl AsRef for ClientConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AllowHttp => "allow_http", + } + } +} + +impl FromStr for ClientConfigKey { + type Err = super::Error; + + fn from_str(s: &str) -> Result { + match s { + "allow_http" => Ok(Self::AllowHttp), + _ => Err(super::Error::UnknownConfigurationKey { + store: "HTTP", + key: s.into(), + }), + } + } +} + /// HTTP client configuration for remote object stores #[derive(Debug, Clone, Default)] pub struct ClientOptions { diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 6f3d53d..375b4d8 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -49,6 +49,7 @@ use url::Url; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; +use crate::client::ClientConfigKey; use crate::{ client::token::TokenCache, multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, @@ -829,6 +830,9 @@ pub enum GoogleConfigKey { /// /// See [`GoogleCloudStorageBuilder::with_application_credentials`]. ApplicationCredentials, + + /// Client options + Client(ClientConfigKey), } impl AsRef for GoogleConfigKey { @@ -838,6 +842,7 @@ impl AsRef for GoogleConfigKey { Self::ServiceAccountKey => "google_service_account_key", Self::Bucket => "google_bucket", Self::ApplicationCredentials => "google_application_credentials", + Self::Client(key) => key.as_ref(), } } } @@ -858,7 +863,10 @@ impl FromStr for GoogleConfigKey { Ok(Self::Bucket) } "google_application_credentials" => Ok(Self::ApplicationCredentials), - _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, } } } @@ -911,9 +919,7 @@ impl GoogleCloudStorageBuilder { for (os_key, os_value) in std::env::vars_os() { if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { if key.starts_with("GOOGLE_") { - if let Ok(config_key) = - GoogleConfigKey::from_str(&key.to_ascii_lowercase()) - { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { builder = builder.with_config(config_key, value); } } @@ -957,6 +963,9 @@ impl GoogleCloudStorageBuilder { GoogleConfigKey::ApplicationCredentials => { self.application_credentials_path = Some(value.into()) } + GoogleConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } }; self } @@ -1005,6 +1014,7 @@ impl GoogleCloudStorageBuilder { GoogleConfigKey::ApplicationCredentials => { self.application_credentials_path.clone() } + GoogleConfigKey::Client(key) => self.client_options.get_config_value(key), } } From 0b48f7fd548674c4db4002063b1b56b595ea660b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 May 2023 18:03:56 
+0100 Subject: [PATCH 136/397] Skip test_list_root on OS X (#3772) (#4198)

* Skip test_list_root if cannot list root filesystem (#3772)
* do not run on max
* Remove list check

---------

Co-authored-by: Andrew Lamb
---
 src/local.rs | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/local.rs b/src/local.rs
index 286853d..b40f5a7 100644
--- a/src/local.rs
+++ b/src/local.rs
@@ -1117,19 +1117,23 @@ mod tests {
     }

     #[tokio::test]
+    #[cfg(target_family = "windows")]
     async fn test_list_root() {
-        let integration = LocalFileSystem::new();
-        let result = integration.list_with_delimiter(None).await;
-        if cfg!(target_family = "windows") {
-            let r = result.unwrap_err().to_string();
-            assert!(
-                r.contains("Unable to convert URL \"file:///\" to filesystem path"),
-                "{}",
-                r
-            );
-        } else {
-            result.unwrap();
-        }
+        let fs = LocalFileSystem::new();
+        let r = fs.list_with_delimiter(None).await.unwrap_err().to_string();
+
+        assert!(
+            r.contains("Unable to convert URL \"file:///\" to filesystem path"),
+            "{}",
+            r
+        );
+    }
+
+    #[tokio::test]
+    #[cfg(target_os = "linux")]
+    async fn test_list_root() {
+        let fs = LocalFileSystem::new();
+        fs.list_with_delimiter(None).await.unwrap();
     }

     async fn check_list(

From c39e22bef5ca4fc36e1e207efef8295efb92293e Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Date: Fri, 12 May 2023 12:01:33 +0100
Subject: [PATCH 137/397] Create ObjectStore from URL and Options (#4047) (#4200)

* Add parse_url function (#4047)
* Clippy
* Fix copypasta
* Fix wasm32 build
* More wasm fixes
* Return remaining path
* Don't use from_env
---
 src/aws/mod.rs |   2 +-
 src/lib.rs     |   3 +
 src/parse.rs   | 265 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 src/parse.rs

diff --git a/src/aws/mod.rs b/src/aws/mod.rs
index 17d779f..6fa5e1c 100644
--- a/src/aws/mod.rs
+++ b/src/aws/mod.rs
@@ -722,7 +722,7 @@ impl AmazonS3Builder {
     ///
     /// - `s3:///`
     /// - `s3a:///`
-    /// - `https://s3..amazonaws.com`
+    /// - `https://s3..amazonaws.com/`
     /// - `https://.s3..amazonaws.com`
     /// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket`
     ///
diff --git a/src/lib.rs b/src/lib.rs
index 1390a01..2c93802 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -252,8 +252,11 @@ mod config;

 #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))]
 mod multipart;
+mod parse;
 mod util;

+pub use parse::{parse_url, parse_url_opts};
+
 use crate::path::Path;
 #[cfg(not(target_arch = "wasm32"))]
 use crate::util::maybe_spawn_blocking;
diff --git a/src/parse.rs b/src/parse.rs
new file mode 100644
index 0000000..7b89e58
--- /dev/null
+++ b/src/parse.rs
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
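For orientation, a minimal usage sketch of the `parse_url` API this patch introduces (illustrative only, not part of the diff). It assumes a tokio runtime plus the `url` and `bytes` crates, and uses the in-memory scheme so no cloud feature flags are needed:

use object_store::{parse_url, path::Path, ObjectStore};
use url::Url;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // "memory://" maps to the built-in InMemory store.
    let url = Url::parse("memory:///data/file.parquet")?;

    // parse_url picks the store implementation from the URL scheme and
    // returns the remaining path to the object within that store.
    let (store, path): (Box<dyn ObjectStore>, Path) = parse_url(&url)?;
    assert_eq!(path, Path::from("data/file.parquet"));

    store.put(&path, bytes::Bytes::from_static(b"hello")).await?;
    let data = store.get(&path).await?.bytes().await?;
    assert_eq!(data.as_ref(), b"hello");
    Ok(())
}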
+ +#[cfg(not(target_arch = "wasm32"))] +use crate::local::LocalFileSystem; +use crate::memory::InMemory; +use crate::path::Path; +use crate::ObjectStore; +use snafu::Snafu; +use url::Url; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] + InvalidUrl { url: Url }, + + #[snafu(display("Unable to recognise URL \"{}\"", url))] + Unrecognised { url: Url }, + + #[snafu(display("Feature {scheme:?} not enabled"))] + NotEnabled { scheme: ObjectStoreScheme }, + + #[snafu(context(false))] + Path { source: crate::path::Error }, +} + +impl From for super::Error { + fn from(e: Error) -> Self { + Self::Generic { + store: "URL", + source: Box::new(e), + } + } +} + +/// Recognises various URL formats, identifying the relevant [`ObjectStore`](crate::ObjectStore) +#[derive(Debug, Eq, PartialEq)] +enum ObjectStoreScheme { + /// Url corresponding to [`LocalFileSystem`](crate::local::LocalFileSystem) + Local, + /// Url corresponding to [`InMemory`](crate::memory::InMemory) + Memory, + /// Url corresponding to [`AmazonS3`](crate::aws::AmazonS3) + AmazonS3, + /// Url corresponding to [`GoogleCloudStorage`](crate::gcp::GoogleCloudStorage) + GoogleCloudStorage, + /// Url corresponding to [`MicrosoftAzure`](crate::azure::MicrosoftAzure) + MicrosoftAzure, + /// Url corresponding to [`HttpStore`](crate::http::HttpStore) + Http, +} + +impl ObjectStoreScheme { + /// Create an [`ObjectStoreScheme`] from the provided [`Url`] + /// + /// Returns the [`ObjectStoreScheme`] and the remaining [`Path`] + fn parse(url: &Url) -> Result<(Self, Path), Error> { + let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); + + let (scheme, path) = match (url.scheme(), url.host_str()) { + ("file", None) => (Self::Local, url.path()), + ("memory", None) => (Self::Memory, url.path()), + ("s3" | "s3a", Some(_)) => (Self::AmazonS3, url.path()), + ("gs", Some(_)) => (Self::GoogleCloudStorage, url.path()), + ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => { + (Self::MicrosoftAzure, url.path()) + } + ("http", Some(_)) => (Self::Http, url.path()), + ("https", Some(host)) => { + if host.ends_with("dfs.core.windows.net") + || host.ends_with("blob.core.windows.net") + { + (Self::MicrosoftAzure, url.path()) + } else if host.ends_with("amazonaws.com") { + match host.starts_with("s3") { + true => (Self::AmazonS3, strip_bucket().unwrap_or_default()), + false => (Self::AmazonS3, url.path()), + } + } else if host.ends_with("r2.cloudflarestorage.com") { + (Self::AmazonS3, strip_bucket().unwrap_or_default()) + } else { + (Self::Http, url.path()) + } + } + _ => return Err(Error::Unrecognised { url: url.clone() }), + }; + + let path = Path::parse(path)?; + Ok((scheme, path)) + } +} + +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] +macro_rules! builder_opts { + ($builder:ty, $url:expr, $options:expr) => {{ + let builder = $options.into_iter().fold( + <$builder>::new().with_url($url.as_str()), + |builder, (key, value)| match key.as_ref().parse() { + Ok(k) => builder.with_config(k, value), + Err(_) => builder, + }, + ); + Box::new(builder.build()?) 
as _ + }}; +} + +/// Create an [`ObjectStore`] based on the provided `url` +/// +/// Returns +/// - An [`ObjectStore`] of the corresponding type +/// - The [`Path`] into the [`ObjectStore`] of the addressed resource +pub fn parse_url(url: &Url) -> Result<(Box, Path), super::Error> { + parse_url_opts(url, std::iter::empty::<(&str, &str)>()) +} + +/// Create an [`ObjectStore`] based on the provided `url` and options +/// +/// Returns +/// - An [`ObjectStore`] of the corresponding type +/// - The [`Path`] into the [`ObjectStore`] of the addressed resource +pub fn parse_url_opts( + url: &Url, + options: I, +) -> Result<(Box, Path), super::Error> +where + I: IntoIterator, + K: AsRef, + V: Into, +{ + let _options = options; + let (scheme, path) = ObjectStoreScheme::parse(url)?; + let path = Path::parse(path)?; + + let store = match scheme { + #[cfg(not(target_arch = "wasm32"))] + ObjectStoreScheme::Local => Box::new(LocalFileSystem::new()) as _, + ObjectStoreScheme::Memory => Box::new(InMemory::new()) as _, + #[cfg(feature = "aws")] + ObjectStoreScheme::AmazonS3 => { + builder_opts!(crate::aws::AmazonS3Builder, url, _options) + } + #[cfg(feature = "gcp")] + ObjectStoreScheme::GoogleCloudStorage => { + builder_opts!(crate::gcp::GoogleCloudStorageBuilder, url, _options) + } + #[cfg(feature = "azure")] + ObjectStoreScheme::MicrosoftAzure => { + builder_opts!(crate::azure::MicrosoftAzureBuilder, url, _options) + } + #[cfg(feature = "http")] + ObjectStoreScheme::Http => { + let url = &url[..url::Position::BeforePath]; + Box::new(crate::http::HttpBuilder::new().with_url(url).build()?) as _ + } + #[cfg(not(all( + feature = "aws", + feature = "azure", + feature = "gcp", + feature = "http" + )))] + s => { + return Err(super::Error::Generic { + store: "parse_url", + source: format!("feature for {s:?} not enabled").into(), + }) + } + }; + + Ok((store, path)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse() { + let cases = [ + ("file:/path", (ObjectStoreScheme::Local, "path")), + ("file:///path", (ObjectStoreScheme::Local, "path")), + ("memory:/path", (ObjectStoreScheme::Memory, "path")), + ("memory:///", (ObjectStoreScheme::Memory, "")), + ("s3://bucket/path", (ObjectStoreScheme::AmazonS3, "path")), + ("s3a://bucket/path", (ObjectStoreScheme::AmazonS3, "path")), + ( + "https://s3.region.amazonaws.com/bucket", + (ObjectStoreScheme::AmazonS3, ""), + ), + ( + "https://s3.region.amazonaws.com/bucket/path", + (ObjectStoreScheme::AmazonS3, "path"), + ), + ( + "https://bucket.s3.region.amazonaws.com", + (ObjectStoreScheme::AmazonS3, ""), + ), + ( + "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket", + (ObjectStoreScheme::AmazonS3, ""), + ), + ( + "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket/path", + (ObjectStoreScheme::AmazonS3, "path"), + ), + ( + "abfs://container/path", + (ObjectStoreScheme::MicrosoftAzure, "path"), + ), + ( + "abfs://file_system@account_name.dfs.core.windows.net/path", + (ObjectStoreScheme::MicrosoftAzure, "path"), + ), + ( + "abfss://file_system@account_name.dfs.core.windows.net/path", + (ObjectStoreScheme::MicrosoftAzure, "path"), + ), + ( + "https://account.dfs.core.windows.net", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "https://account.blob.core.windows.net", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "gs://bucket/path", + (ObjectStoreScheme::GoogleCloudStorage, "path"), + ), + ("http://mydomain/path", (ObjectStoreScheme::Http, "path")), + ("https://mydomain/path", (ObjectStoreScheme::Http, "path")), + ]; + + for (s, 
(expected_scheme, expected_path)) in cases { + let url = Url::parse(s).unwrap(); + let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap(); + + assert_eq!(scheme, expected_scheme, "{s}"); + assert_eq!(path, Path::parse(expected_path).unwrap(), "{s}"); + } + + let neg_cases = [ + "unix:/run/foo.socket", + "file://remote/path", + "memory://remote/", + ]; + for s in neg_cases { + let url = Url::parse(s).unwrap(); + assert!(ObjectStoreScheme::parse(&url).is_err()); + } + } +} From 17280bd9253fea1136e4834b520aeefbbc969bb8 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Fri, 12 May 2023 19:00:02 +0200 Subject: [PATCH 138/397] feat: extend client option configuration keys (#4208) --- Cargo.toml | 1 + src/client/mod.rs | 356 ++++++++++++++++++++++++++++++++++++++++------ src/config.rs | 55 ++++++- 3 files changed, 367 insertions(+), 45 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b27482b..e25801b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ async-trait = "0.1.53" bytes = "1.0" chrono = { version = "0.4.23", default-features = false, features = ["clock"] } futures = "0.3" +humantime = "2.1" itertools = "0.10.1" parking_lot = { version = "0.12" } percent-encoding = "2.1" diff --git a/src/client/mod.rs b/src/client/mod.rs index d2242dd..ccf1b4a 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -26,14 +26,15 @@ pub mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; -use crate::config::ConfigValue; -use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, ClientBuilder, Proxy}; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use reqwest::header::{HeaderMap, HeaderValue}; +use reqwest::{Client, ClientBuilder, Proxy}; +use serde::{Deserialize, Serialize}; + +use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; fn map_client_error(e: reqwest::Error) -> super::Error { @@ -52,12 +53,64 @@ static DEFAULT_USER_AGENT: &str = pub enum ClientConfigKey { /// Allow non-TLS, i.e. non-HTTPS connections AllowHttp, + /// Skip certificate validation on https connections. + /// + /// # Warning + /// + /// You should think very carefully before using this method. If + /// invalid certificates are trusted, *any* certificate for *any* site + /// will be trusted for use. This includes expired certificates. This + /// introduces significant vulnerabilities, and should only be used + /// as a last resort or for testing + AllowInvalidCertificates, + /// Timeout for only the connect phase of a Client + ConnectTimeout, + /// default CONTENT_TYPE for uploads + DefaultContentType, + /// Only use http1 connections + Http1Only, + /// Interval for HTTP2 Ping frames should be sent to keep a connection alive. + Http2KeepAliveInterval, + /// Timeout for receiving an acknowledgement of the keep-alive ping. 
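For illustration, a sketch (not part of the diff) of driving these string-keyed client options. It assumes `ClientOptions` and `ClientConfigKey` are reachable from the crate root, and that the duration-valued keys accept humantime strings via the `humantime` dependency added here:

use object_store::{ClientConfigKey, ClientOptions};

fn main() {
    // Each (key, value) pair arrives as plain strings, e.g. from a URL query
    // string or a config file; duration values use humantime syntax.
    let opts = [
        ("timeout", "30 seconds"),
        ("connect_timeout", "5 seconds"),
        ("pool_max_idle_per_host", "16"),
        ("user_agent", "my-app/0.1"),
    ]
    .into_iter()
    .fold(ClientOptions::new(), |opts, (key, value)| {
        // Unrecognised keys fail to parse into a ClientConfigKey.
        opts.with_config(key.parse::<ClientConfigKey>().unwrap(), value)
    });

    // Values are stored verbatim and only validated when the client is built.
    assert_eq!(
        opts.get_config_value(&ClientConfigKey::Timeout).as_deref(),
        Some("30 seconds")
    );
}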
+ Http2KeepAliveTimeout, + /// Enable HTTP2 keep alive pings for idle connections + Http2KeepAliveWhileIdle, + /// Only use http2 connections + Http2Only, + /// The pool max idle timeout + /// + /// This is the length of time an idle connection will be kept alive + PoolIdleTimeout, + /// maximum number of idle connections per host + PoolMaxIdlePerHost, + /// HTTP proxy to use for requests + ProxyUrl, + /// Request timeout + /// + /// The timeout is applied from when the request starts connecting until the + /// response body has finished + Timeout, + /// User-Agent header to be used by this client + UserAgent, } impl AsRef for ClientConfigKey { fn as_ref(&self) -> &str { match self { Self::AllowHttp => "allow_http", + Self::AllowInvalidCertificates => "allow_invalid_certificates", + Self::ConnectTimeout => "connect_timeout", + Self::DefaultContentType => "default_content_type", + Self::Http1Only => "http1_only", + Self::Http2Only => "http2_only", + Self::Http2KeepAliveInterval => "http2_keep_alive_interval", + Self::Http2KeepAliveTimeout => "http2_keep_alive_timeout", + Self::Http2KeepAliveWhileIdle => "http2_keep_alive_while_idle", + Self::PoolIdleTimeout => "pool_idle_timeout", + Self::PoolMaxIdlePerHost => "pool_max_idle_per_host", + Self::ProxyUrl => "proxy_url", + Self::Timeout => "timeout", + Self::UserAgent => "user_agent", } } } @@ -68,6 +121,19 @@ impl FromStr for ClientConfigKey { fn from_str(s: &str) -> Result { match s { "allow_http" => Ok(Self::AllowHttp), + "allow_invalid_certificates" => Ok(Self::AllowInvalidCertificates), + "connect_timeout" => Ok(Self::ConnectTimeout), + "default_content_type" => Ok(Self::DefaultContentType), + "http1_only" => Ok(Self::Http1Only), + "http2_only" => Ok(Self::Http2Only), + "http2_keep_alive_interval" => Ok(Self::Http2KeepAliveInterval), + "http2_keep_alive_timeout" => Ok(Self::Http2KeepAliveTimeout), + "http2_keep_alive_while_idle" => Ok(Self::Http2KeepAliveWhileIdle), + "pool_idle_timeout" => Ok(Self::PoolIdleTimeout), + "pool_max_idle_per_host" => Ok(Self::PoolMaxIdlePerHost), + "proxy_url" => Ok(Self::ProxyUrl), + "timeout" => Ok(Self::Timeout), + "user_agent" => Ok(Self::UserAgent), _ => Err(super::Error::UnknownConfigurationKey { store: "HTTP", key: s.into(), @@ -79,22 +145,22 @@ impl FromStr for ClientConfigKey { /// HTTP client configuration for remote object stores #[derive(Debug, Clone, Default)] pub struct ClientOptions { - user_agent: Option, + user_agent: Option>, content_type_map: HashMap, default_content_type: Option, default_headers: Option, proxy_url: Option, allow_http: ConfigValue, - allow_insecure: bool, - timeout: Option, - connect_timeout: Option, - pool_idle_timeout: Option, - pool_max_idle_per_host: Option, - http2_keep_alive_interval: Option, - http2_keep_alive_timeout: Option, - http2_keep_alive_while_idle: bool, - http1_only: bool, - http2_only: bool, + allow_insecure: ConfigValue, + timeout: Option>, + connect_timeout: Option>, + pool_idle_timeout: Option>, + pool_max_idle_per_host: Option>, + http2_keep_alive_interval: Option>, + http2_keep_alive_timeout: Option>, + http2_keep_alive_while_idle: ConfigValue, + http1_only: ConfigValue, + http2_only: ConfigValue, } impl ClientOptions { @@ -107,6 +173,37 @@ impl ClientOptions { pub fn with_config(mut self, key: ClientConfigKey, value: impl Into) -> Self { match key { ClientConfigKey::AllowHttp => self.allow_http.parse(value), + ClientConfigKey::AllowInvalidCertificates => self.allow_insecure.parse(value), + ClientConfigKey::ConnectTimeout => { + self.connect_timeout = 
Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::DefaultContentType => { + self.default_content_type = Some(value.into()) + } + ClientConfigKey::Http1Only => self.http1_only.parse(value), + ClientConfigKey::Http2Only => self.http2_only.parse(value), + ClientConfigKey::Http2KeepAliveInterval => { + self.http2_keep_alive_interval = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::Http2KeepAliveTimeout => { + self.http2_keep_alive_timeout = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::Http2KeepAliveWhileIdle => { + self.http2_keep_alive_while_idle.parse(value) + } + ClientConfigKey::PoolIdleTimeout => { + self.pool_idle_timeout = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::PoolMaxIdlePerHost => { + self.pool_max_idle_per_host = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::ProxyUrl => self.proxy_url = Some(value.into()), + ClientConfigKey::Timeout => { + self.timeout = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::UserAgent => { + self.user_agent = Some(ConfigValue::Deferred(value.into())) + } } self } @@ -115,6 +212,37 @@ impl ClientOptions { pub fn get_config_value(&self, key: &ClientConfigKey) -> Option { match key { ClientConfigKey::AllowHttp => Some(self.allow_http.to_string()), + ClientConfigKey::AllowInvalidCertificates => { + Some(self.allow_insecure.to_string()) + } + ClientConfigKey::ConnectTimeout => { + self.connect_timeout.as_ref().map(fmt_duration) + } + ClientConfigKey::DefaultContentType => self.default_content_type.clone(), + ClientConfigKey::Http1Only => Some(self.http1_only.to_string()), + ClientConfigKey::Http2KeepAliveInterval => { + self.http2_keep_alive_interval.as_ref().map(fmt_duration) + } + ClientConfigKey::Http2KeepAliveTimeout => { + self.http2_keep_alive_timeout.as_ref().map(fmt_duration) + } + ClientConfigKey::Http2KeepAliveWhileIdle => { + Some(self.http2_keep_alive_while_idle.to_string()) + } + ClientConfigKey::Http2Only => Some(self.http2_only.to_string()), + ClientConfigKey::PoolIdleTimeout => { + self.pool_idle_timeout.as_ref().map(fmt_duration) + } + ClientConfigKey::PoolMaxIdlePerHost => { + self.pool_max_idle_per_host.as_ref().map(|v| v.to_string()) + } + ClientConfigKey::ProxyUrl => self.proxy_url.clone(), + ClientConfigKey::Timeout => self.timeout.as_ref().map(fmt_duration), + ClientConfigKey::UserAgent => self + .user_agent + .as_ref() + .and_then(|v| v.get().ok()) + .and_then(|v| v.to_str().ok().map(|s| s.to_string())), } } @@ -122,7 +250,7 @@ impl ClientOptions { /// /// Default is based on the version of this crate pub fn with_user_agent(mut self, agent: HeaderValue) -> Self { - self.user_agent = Some(agent); + self.user_agent = Some(agent.into()); self } @@ -167,19 +295,19 @@ impl ClientOptions { /// introduces significant vulnerabilities, and should only be used /// as a last resort or for testing pub fn with_allow_invalid_certificates(mut self, allow_insecure: bool) -> Self { - self.allow_insecure = allow_insecure; + self.allow_insecure = allow_insecure.into(); self } /// Only use http1 connections pub fn with_http1_only(mut self) -> Self { - self.http1_only = true; + self.http1_only = true.into(); self } /// Only use http2 connections pub fn with_http2_only(mut self) -> Self { - self.http2_only = true; + self.http2_only = true.into(); self } @@ -194,13 +322,13 @@ impl ClientOptions { /// The timeout is applied from when the request starts connecting until the /// response body has finished pub fn with_timeout(mut self, timeout: 
Duration) -> Self { - self.timeout = Some(timeout); + self.timeout = Some(ConfigValue::Parsed(timeout)); self } /// Set a timeout for only the connect phase of a Client pub fn with_connect_timeout(mut self, timeout: Duration) -> Self { - self.connect_timeout = Some(timeout); + self.connect_timeout = Some(ConfigValue::Parsed(timeout)); self } @@ -210,7 +338,7 @@ impl ClientOptions { /// /// Default is 90 seconds pub fn with_pool_idle_timeout(mut self, timeout: Duration) -> Self { - self.pool_idle_timeout = Some(timeout); + self.pool_idle_timeout = Some(ConfigValue::Parsed(timeout)); self } @@ -218,7 +346,7 @@ impl ClientOptions { /// /// Default is no limit pub fn with_pool_max_idle_per_host(mut self, max: usize) -> Self { - self.pool_max_idle_per_host = Some(max); + self.pool_max_idle_per_host = Some(max.into()); self } @@ -226,7 +354,7 @@ impl ClientOptions { /// /// Default is disabled pub fn with_http2_keep_alive_interval(mut self, interval: Duration) -> Self { - self.http2_keep_alive_interval = Some(interval); + self.http2_keep_alive_interval = Some(ConfigValue::Parsed(interval)); self } @@ -237,7 +365,7 @@ impl ClientOptions { /// /// Default is disabled pub fn with_http2_keep_alive_timeout(mut self, interval: Duration) -> Self { - self.http2_keep_alive_timeout = Some(interval); + self.http2_keep_alive_timeout = Some(ConfigValue::Parsed(interval)); self } @@ -248,7 +376,7 @@ impl ClientOptions { /// /// Default is disabled pub fn with_http2_keep_alive_while_idle(mut self) -> Self { - self.http2_keep_alive_while_idle = true; + self.http2_keep_alive_while_idle = true.into(); self } @@ -274,7 +402,7 @@ impl ClientOptions { let mut builder = ClientBuilder::new(); match &self.user_agent { - Some(user_agent) => builder = builder.user_agent(user_agent), + Some(user_agent) => builder = builder.user_agent(user_agent.get()?), None => builder = builder.user_agent(DEFAULT_USER_AGENT), } @@ -287,44 +415,44 @@ impl ClientOptions { builder = builder.proxy(proxy); } - if let Some(timeout) = self.timeout { - builder = builder.timeout(timeout) + if let Some(timeout) = &self.timeout { + builder = builder.timeout(timeout.get()?) } - if let Some(timeout) = self.connect_timeout { - builder = builder.connect_timeout(timeout) + if let Some(timeout) = &self.connect_timeout { + builder = builder.connect_timeout(timeout.get()?) } - if let Some(timeout) = self.pool_idle_timeout { - builder = builder.pool_idle_timeout(timeout) + if let Some(timeout) = &self.pool_idle_timeout { + builder = builder.pool_idle_timeout(timeout.get()?) } - if let Some(max) = self.pool_max_idle_per_host { - builder = builder.pool_max_idle_per_host(max) + if let Some(max) = &self.pool_max_idle_per_host { + builder = builder.pool_max_idle_per_host(max.get()?) } - if let Some(interval) = self.http2_keep_alive_interval { - builder = builder.http2_keep_alive_interval(interval) + if let Some(interval) = &self.http2_keep_alive_interval { + builder = builder.http2_keep_alive_interval(interval.get()?) } - if let Some(interval) = self.http2_keep_alive_timeout { - builder = builder.http2_keep_alive_timeout(interval) + if let Some(interval) = &self.http2_keep_alive_timeout { + builder = builder.http2_keep_alive_timeout(interval.get()?) } - if self.http2_keep_alive_while_idle { + if self.http2_keep_alive_while_idle.get()? { builder = builder.http2_keep_alive_while_idle(true) } - if self.http1_only { + if self.http1_only.get()? { builder = builder.http1_only() } - if self.http2_only { + if self.http2_only.get()? 
{ builder = builder.http2_prior_knowledge() } - if self.allow_insecure { - builder = builder.danger_accept_invalid_certs(self.allow_insecure) + if self.allow_insecure.get()? { + builder = builder.danger_accept_invalid_certs(true) } builder @@ -333,3 +461,143 @@ impl ClientOptions { .map_err(map_client_error) } } + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + #[test] + fn client_test_config_from_map() { + let allow_http = "true".to_string(); + let allow_invalid_certificates = "false".to_string(); + let connect_timeout = "90 seconds".to_string(); + let default_content_type = "object_store:fake_default_content_type".to_string(); + let http1_only = "true".to_string(); + let http2_only = "false".to_string(); + let http2_keep_alive_interval = "90 seconds".to_string(); + let http2_keep_alive_timeout = "91 seconds".to_string(); + let http2_keep_alive_while_idle = "92 seconds".to_string(); + let pool_idle_timeout = "93 seconds".to_string(); + let pool_max_idle_per_host = "94".to_string(); + let proxy_url = "https://fake_proxy_url".to_string(); + let timeout = "95 seconds".to_string(); + let user_agent = "object_store:fake_user_agent".to_string(); + + let options = HashMap::from([ + ("allow_http", allow_http.clone()), + ( + "allow_invalid_certificates", + allow_invalid_certificates.clone(), + ), + ("connect_timeout", connect_timeout.clone()), + ("default_content_type", default_content_type.clone()), + ("http1_only", http1_only.clone()), + ("http2_only", http2_only.clone()), + ( + "http2_keep_alive_interval", + http2_keep_alive_interval.clone(), + ), + ("http2_keep_alive_timeout", http2_keep_alive_timeout.clone()), + ( + "http2_keep_alive_while_idle", + http2_keep_alive_while_idle.clone(), + ), + ("pool_idle_timeout", pool_idle_timeout.clone()), + ("pool_max_idle_per_host", pool_max_idle_per_host.clone()), + ("proxy_url", proxy_url.clone()), + ("timeout", timeout.clone()), + ("user_agent", user_agent.clone()), + ]); + + let builder = options + .into_iter() + .fold(ClientOptions::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); + + assert_eq!( + builder + .get_config_value(&ClientConfigKey::AllowHttp) + .unwrap(), + allow_http + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::AllowInvalidCertificates) + .unwrap(), + allow_invalid_certificates + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::ConnectTimeout) + .unwrap(), + connect_timeout + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::DefaultContentType) + .unwrap(), + default_content_type + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http1Only) + .unwrap(), + http1_only + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2Only) + .unwrap(), + http2_only + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2KeepAliveInterval) + .unwrap(), + http2_keep_alive_interval + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2KeepAliveTimeout) + .unwrap(), + http2_keep_alive_timeout + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2KeepAliveWhileIdle) + .unwrap(), + http2_keep_alive_while_idle + ); + + assert_eq!( + builder + .get_config_value(&ClientConfigKey::PoolIdleTimeout) + .unwrap(), + pool_idle_timeout + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::PoolMaxIdlePerHost) + .unwrap(), + pool_max_idle_per_host + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::ProxyUrl) + .unwrap(), + proxy_url + 
); + assert_eq!( + builder.get_config_value(&ClientConfigKey::Timeout).unwrap(), + timeout + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::UserAgent) + .unwrap(), + user_agent + ); + } +} diff --git a/src/config.rs b/src/config.rs index 3ecce2e..987e6e4 100644 --- a/src/config.rs +++ b/src/config.rs @@ -14,9 +14,14 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +use std::fmt::{Debug, Display, Formatter}; +use std::str::FromStr; +use std::time::Duration; + +use humantime::{format_duration, parse_duration}; +use reqwest::header::HeaderValue; use crate::{Error, Result}; -use std::fmt::{Debug, Display, Formatter}; /// Provides deferred parsing of a value /// @@ -79,3 +84,51 @@ impl Parse for bool { } } } + +impl Parse for Duration { + fn parse(v: &str) -> Result { + parse_duration(v).map_err(|_| Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as Duration").into(), + }) + } +} + +impl Parse for usize { + fn parse(v: &str) -> Result { + Self::from_str(v).map_err(|_| Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as usize").into(), + }) + } +} + +impl Parse for HeaderValue { + fn parse(v: &str) -> Result { + Self::from_str(v).map_err(|_| Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as HeaderValue").into(), + }) + } +} + +pub(crate) fn fmt_duration(duration: &ConfigValue) -> String { + match duration { + ConfigValue::Parsed(v) => format_duration(*v).to_string(), + ConfigValue::Deferred(v) => v.clone(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + #[test] + fn test_parse_duration() { + let duration = Duration::from_secs(60); + assert_eq!(Duration::parse("60 seconds").unwrap(), duration); + assert_eq!(Duration::parse("60 s").unwrap(), duration); + assert_eq!(Duration::parse("60s").unwrap(), duration) + } +} From ca74a3e380b6902f7444b013a71ee680c396cb93 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 14 May 2023 16:36:41 +0100 Subject: [PATCH 139/397] Implement list_with_offset for PrefixStore (#4203) --- src/prefix.rs | 147 ++++++++++++++++++++++++-------------------------- 1 file changed, 69 insertions(+), 78 deletions(-) diff --git a/src/prefix.rs b/src/prefix.rs index eba3795..94836d3 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -22,10 +22,7 @@ use std::ops::Range; use tokio::io::AsyncWrite; use crate::path::Path; -use crate::{ - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, - Result as ObjectStoreResult, -}; +use crate::{GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result}; #[doc(hidden)] #[deprecated(note = "Use PrefixStore")] @@ -59,36 +56,63 @@ impl PrefixStore { } /// Strip the constant prefix from a given path - fn strip_prefix(&self, path: &Path) -> Option { - Some(path.prefix_match(&self.prefix)?.collect()) + fn strip_prefix(&self, path: Path) -> Path { + // Note cannot use match because of borrow checker + if let Some(suffix) = path.prefix_match(&self.prefix) { + return suffix.collect(); + } + path + } + + /// Strip the constant prefix from a given ObjectMeta + fn strip_meta(&self, meta: ObjectMeta) -> ObjectMeta { + ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: self.strip_prefix(meta.location), + e_tag: meta.e_tag, + } } } #[async_trait::async_trait] impl ObjectStore for PrefixStore { - async fn put(&self, 
location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { let full_path = self.full_path(location); self.inner.put(&full_path, bytes).await } + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let full_path = self.full_path(location); + self.inner.put_multipart(&full_path).await + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + let full_path = self.full_path(location); + self.inner.abort_multipart(&full_path, multipart_id).await + } + async fn append( &self, location: &Path, - ) -> ObjectStoreResult> { + ) -> Result> { let full_path = self.full_path(location); self.inner.append(&full_path).await } - async fn get(&self, location: &Path) -> ObjectStoreResult { + async fn get(&self, location: &Path) -> Result { let full_path = self.full_path(location); self.inner.get(&full_path).await } - async fn get_range( - &self, - location: &Path, - range: Range, - ) -> ObjectStoreResult { + async fn get_range(&self, location: &Path, range: Range) -> Result { let full_path = self.full_path(location); self.inner.get_range(&full_path, range).await } @@ -97,22 +121,18 @@ impl ObjectStore for PrefixStore { &self, location: &Path, ranges: &[Range], - ) -> ObjectStoreResult> { + ) -> Result> { let full_path = self.full_path(location); self.inner.get_ranges(&full_path, ranges).await } - async fn head(&self, location: &Path) -> ObjectStoreResult { + async fn head(&self, location: &Path) -> Result { let full_path = self.full_path(location); - self.inner.head(&full_path).await.map(|meta| ObjectMeta { - last_modified: meta.last_modified, - size: meta.size, - location: self.strip_prefix(&meta.location).unwrap_or(meta.location), - e_tag: meta.e_tag, - }) + let meta = self.inner.head(&full_path).await?; + Ok(self.strip_meta(meta)) } - async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { + async fn delete(&self, location: &Path) -> Result<()> { let full_path = self.full_path(location); self.inner.delete(&full_path).await } @@ -120,94 +140,65 @@ impl ObjectStore for PrefixStore { async fn list( &self, prefix: Option<&Path>, - ) -> ObjectStoreResult>> { - Ok(self - .inner - .list(Some(&self.full_path(prefix.unwrap_or(&Path::from("/"))))) - .await? 
- .map_ok(|meta| ObjectMeta { - last_modified: meta.last_modified, - size: meta.size, - location: self.strip_prefix(&meta.location).unwrap_or(meta.location), - e_tag: meta.e_tag, - }) - .boxed()) + ) -> Result>> { + let prefix = self.full_path(prefix.unwrap_or(&Path::default())); + let s = self.inner.list(Some(&prefix)).await?; + Ok(s.map_ok(|meta| self.strip_meta(meta)).boxed()) } - async fn list_with_delimiter( + async fn list_with_offset( &self, prefix: Option<&Path>, - ) -> ObjectStoreResult { + offset: &Path, + ) -> Result>> { + let offset = self.full_path(offset); + let prefix = self.full_path(prefix.unwrap_or(&Path::default())); + let s = self.inner.list_with_offset(Some(&prefix), &offset).await?; + Ok(s.map_ok(|meta| self.strip_meta(meta)).boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let prefix = self.full_path(prefix.unwrap_or(&Path::default())); self.inner - .list_with_delimiter(Some( - &self.full_path(prefix.unwrap_or(&Path::from("/"))), - )) + .list_with_delimiter(Some(&prefix)) .await .map(|lst| ListResult { common_prefixes: lst .common_prefixes - .iter() - .filter_map(|p| self.strip_prefix(p)) + .into_iter() + .map(|p| self.strip_prefix(p)) .collect(), objects: lst .objects - .iter() - .filter_map(|meta| { - Some(ObjectMeta { - last_modified: meta.last_modified, - size: meta.size, - location: self.strip_prefix(&meta.location)?, - e_tag: meta.e_tag.clone(), - }) - }) + .into_iter() + .map(|meta| self.strip_meta(meta)) .collect(), }) } - async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); self.inner.copy(&full_from, &full_to).await } - async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); - self.inner.copy_if_not_exists(&full_from, &full_to).await + self.inner.rename(&full_from, &full_to).await } - async fn rename(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); - self.inner.rename(&full_from, &full_to).await + self.inner.copy_if_not_exists(&full_from, &full_to).await } - async fn rename_if_not_exists( - &self, - from: &Path, - to: &Path, - ) -> ObjectStoreResult<()> { + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); self.inner.rename_if_not_exists(&full_from, &full_to).await } - - async fn put_multipart( - &self, - location: &Path, - ) -> ObjectStoreResult<(MultipartId, Box)> { - let full_path = self.full_path(location); - self.inner.put_multipart(&full_path).await - } - - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - let full_path = self.full_path(location); - self.inner.abort_multipart(&full_path, multipart_id).await - } } #[cfg(test)] From 797d719e9428339f2a270062dd237fc5c9637818 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 15 May 2023 12:01:19 +0100 Subject: [PATCH 140/397] Add ObjectStore::get_opts (#2241) (#4212) * Add ObjectStore::get_opts (#2241) * Cleanup error handling * Review feedback --- src/aws/client.rs | 36 
+++------ src/aws/credential.rs | 8 +- src/aws/mod.rs | 39 ++++------ src/azure/client.rs | 58 +++++--------- src/azure/mod.rs | 46 +++++------ src/chunked.rs | 7 +- src/client/mod.rs | 37 ++++++++- src/client/retry.rs | 37 ++++++++- src/gcp/mod.rs | 128 +++++++++++-------------------- src/http/client.rs | 21 ++---- src/http/mod.rs | 21 +----- src/lib.rs | 172 +++++++++++++++++++++++++++++++++++++++++- src/limit.rs | 20 +++-- src/local.rs | 57 ++++++++------ src/memory.rs | 28 +++---- src/prefix.rs | 19 +++-- src/throttle.rs | 43 ++++++----- src/util.rs | 7 -- 18 files changed, 470 insertions(+), 314 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 9634c74..b2d01ab 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -17,27 +17,25 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; -use crate::aws::STRICT_PATH_ENCODE_SET; +use crate::aws::{STORE, STRICT_PATH_ENCODE_SET}; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; +use crate::client::GetOptionsExt; use crate::multipart::UploadPart; use crate::path::DELIMITER; -use crate::util::{format_http_range, format_prefix}; +use crate::util::format_prefix; use crate::{ - BoxStream, ClientOptions, ListResult, MultipartId, ObjectMeta, Path, Result, - RetryConfig, StreamExt, + BoxStream, ClientOptions, GetOptions, ListResult, MultipartId, ObjectMeta, Path, + Result, RetryConfig, StreamExt, }; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::{utf8_percent_encode, PercentEncode}; -use reqwest::{ - header::CONTENT_TYPE, Client as ReqwestClient, Method, Response, StatusCode, -}; +use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, Response}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; -use std::ops::Range; use std::sync::Arc; /// A specialized `Error` for object store-related errors @@ -102,16 +100,9 @@ impl From for crate::Error { Error::GetRequest { source, path } | Error::DeleteRequest { source, path } | Error::CopyRequest { source, path } - | Error::PutRequest { source, path } - if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => - { - Self::NotFound { - path, - source: Box::new(source), - } - } + | Error::PutRequest { source, path } => source.error(STORE, path), _ => Self::Generic { - store: "S3", + store: STORE, source: Box::new(err), }, } @@ -245,11 +236,9 @@ impl S3Client { pub async fn get_request( &self, path: &Path, - range: Option>, + options: GetOptions, head: bool, ) -> Result { - use reqwest::header::RANGE; - let credential = self.get_credential().await?; let url = self.config.path_url(path); let method = match head { @@ -257,13 +246,10 @@ impl S3Client { false => Method::GET, }; - let mut builder = self.client.request(method, url); - - if let Some(range) = range { - builder = builder.header(RANGE, format_http_range(range)); - } + let builder = self.client.request(method, url); let response = builder + .with_get_options(options) .with_aws_sigv4( credential.as_ref(), &self.config.region, diff --git a/src/aws/credential.rs b/src/aws/credential.rs index c4cb7cf..16cdf35 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
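For illustration, a sketch (not part of the diff) of a conditional read through the new `ObjectStore::get_opts` entry point; it assumes the `if_none_match` field on `GetOptions` and the `Error::NotModified` variant added by this change:

use object_store::{path::Path, GetOptions, ObjectStore};

/// Fetch `path`, skipping the download when the caller's cached ETag still matches.
async fn fetch_if_changed(
    store: &dyn ObjectStore,
    path: &Path,
    cached_etag: Option<String>,
) -> object_store::Result<Option<bytes::Bytes>> {
    let options = GetOptions {
        if_none_match: cached_etag,
        ..GetOptions::default()
    };
    match store.get_opts(path, options).await {
        Ok(result) => Ok(Some(result.bytes().await?)),
        // The retry layer maps an HTTP 304 response to this variant.
        Err(object_store::Error::NotModified { .. }) => Ok(None),
        Err(e) => Err(e),
    }
}

Callers that previously polled with `head` and compared ETags by hand can instead pass the cached tag and treat `None` as "use the local copy".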
-use crate::aws::STRICT_ENCODE_SET; +use crate::aws::{STORE, STRICT_ENCODE_SET}; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::util::hmac_sha256; @@ -330,7 +330,7 @@ impl CredentialProvider for InstanceCredentialProvider { self.imdsv1_fallback, ) .map_err(|source| crate::Error::Generic { - store: "S3", + store: STORE, source, }) })) @@ -363,7 +363,7 @@ impl CredentialProvider for WebIdentityProvider { &self.endpoint, ) .map_err(|source| crate::Error::Generic { - store: "S3", + store: STORE, source, }) })) @@ -552,7 +552,7 @@ mod profile { .provide_credentials() .await .map_err(|source| crate::Error::Generic { - store: "S3", + store: STORE, source: Box::new(source), })?; let t_now = SystemTime::now(); diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 6fa5e1c..3f9b480 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -40,7 +40,6 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; -use std::ops::Range; use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; @@ -57,8 +56,8 @@ use crate::client::ClientConfigKey; use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::{ - ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, - Result, RetryConfig, StreamExt, + ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, Path, Result, RetryConfig, StreamExt, }; mod checksum; @@ -79,6 +78,8 @@ pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = /// This struct is used to maintain the URI path encoding const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); +const STORE: &str = "S3"; + /// Default metadata endpoint static METADATA_ENDPOINT: &str = "http://169.254.169.254"; @@ -160,10 +161,10 @@ impl From for super::Error { fn from(source: Error) -> Self { match source { Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: "S3", key } + Self::UnknownConfigurationKey { store: STORE, key } } _ => Self::Generic { - store: "S3", + store: STORE, source: Box::new(source), }, } @@ -246,12 +247,12 @@ impl ObjectStore for AmazonS3 { .await } - async fn get(&self, location: &Path) -> Result { - let response = self.client.get_request(location, None, false).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let response = self.client.get_request(location, options, false).await?; let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { - store: "S3", + store: STORE, source: Box::new(source), }) .boxed(); @@ -259,26 +260,13 @@ impl ObjectStore for AmazonS3 { Ok(GetResult::Stream(stream)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let bytes = self - .client - .get_request(location, Some(range), false) - .await? 
- .bytes() - .await - .map_err(|source| client::Error::GetResponseBody { - source, - path: location.to_string(), - })?; - Ok(bytes) - } - async fn head(&self, location: &Path) -> Result { use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; + let options = GetOptions::default(); // Extract meta from headers // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax - let response = self.client.get_request(location, None, true).await?; + let response = self.client.get_request(location, options, true).await?; let headers = response.headers(); let last_modified = headers @@ -1169,8 +1157,8 @@ fn profile_credentials( mod tests { use super::*; use crate::tests::{ - get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list_opts, rename_and_copy, stream_get, + get_nonexistent_object, get_opts, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, }; use bytes::Bytes; use std::collections::HashMap; @@ -1417,6 +1405,7 @@ mod tests { // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 put_get_delete_list_opts(&integration, is_local).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/src/azure/client.rs b/src/azure/client.rs index 87432f6..4611986 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -17,13 +17,15 @@ use super::credential::{AzureCredential, CredentialProvider}; use crate::azure::credential::*; +use crate::azure::STORE; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; +use crate::client::GetOptionsExt; use crate::path::DELIMITER; -use crate::util::{deserialize_rfc1123, format_http_range, format_prefix}; +use crate::util::{deserialize_rfc1123, format_prefix}; use crate::{ - BoxStream, ClientOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, - StreamExt, + BoxStream, ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, + RetryConfig, StreamExt, }; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -32,13 +34,12 @@ use chrono::{DateTime, Utc}; use itertools::Itertools; use reqwest::header::CONTENT_TYPE; use reqwest::{ - header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, + header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH}, Client as ReqwestClient, Method, Response, StatusCode, }; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::collections::HashMap; -use std::ops::Range; use url::Url; /// A specialized `Error` for object store-related errors @@ -69,12 +70,6 @@ pub(crate) enum Error { path: String, }, - #[snafu(display("Error performing copy request {}: {}", path, source))] - CopyRequest { - source: crate::client::retry::Error, - path: String, - }, - #[snafu(display("Error performing list request: {}", source))] ListRequest { source: crate::client::retry::Error }, @@ -95,25 +90,9 @@ impl From for crate::Error { match err { Error::GetRequest { source, path } | Error::DeleteRequest { source, path } - | Error::CopyRequest { source, path } - | Error::PutRequest { source, path } - if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => - { - Self::NotFound { - path, - source: Box::new(source), - } - } - Error::CopyRequest { source, path } - if matches!(source.status(), Some(StatusCode::CONFLICT)) => - { - Self::AlreadyExists { - path, - source: 
Box::new(source), - } - } + | Error::PutRequest { source, path } => source.error(STORE, path), _ => Self::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(err), }, } @@ -175,7 +154,7 @@ impl AzureClient { // and we want to use it in an infallible function HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { crate::Error::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(err), } })?, @@ -193,7 +172,7 @@ impl AzureClient { // and we want to use it in an infallible function HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { crate::Error::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(err), } })?, @@ -253,7 +232,7 @@ impl AzureClient { pub async fn get_request( &self, path: &Path, - range: Option>, + options: GetOptions, head: bool, ) -> Result { let credential = self.get_credential().await?; @@ -263,17 +242,14 @@ impl AzureClient { false => Method::GET, }; - let mut builder = self + let builder = self .client .request(method, url) .header(CONTENT_LENGTH, HeaderValue::from_static("0")) .body(Bytes::new()); - if let Some(range) = range { - builder = builder.header(RANGE, format_http_range(range)); - } - let response = builder + .with_get_options(options) .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await @@ -338,8 +314,12 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(CopyRequestSnafu { - path: from.as_ref(), + .map_err(|err| match err.status() { + Some(StatusCode::CONFLICT) => crate::Error::AlreadyExists { + source: Box::new(err), + path: to.to_string(), + }, + _ => err.error(STORE, from.to_string()), })?; Ok(()) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index c2cfdfe..6726241 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -31,8 +31,8 @@ use crate::client::token::TokenCache; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::Path, - ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, - RetryConfig, + ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -45,7 +45,6 @@ use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::fmt::{Debug, Formatter}; use std::io; -use std::ops::Range; use std::sync::Arc; use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; @@ -59,6 +58,8 @@ pub use credential::authority_hosts; mod client; mod credential; +const STORE: &str = "MicrosoftAzure"; + /// The well-known account used by Azurite and the legacy Azure Storage Emulator. 
/// const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; @@ -150,12 +151,11 @@ enum Error { impl From for super::Error { fn from(source: Error) -> Self { match source { - Error::UnknownConfigurationKey { key } => Self::UnknownConfigurationKey { - store: "MicrosoftAzure", - key, - }, + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: STORE, key } + } _ => Self::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(source), }, } @@ -209,12 +209,12 @@ impl ObjectStore for MicrosoftAzure { Ok(()) } - async fn get(&self, location: &Path) -> Result { - let response = self.client.get_request(location, None, false).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let response = self.client.get_request(location, options, false).await?; let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(source), }) .boxed(); @@ -222,26 +222,13 @@ impl ObjectStore for MicrosoftAzure { Ok(GetResult::Stream(stream)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let bytes = self - .client - .get_request(location, Some(range), false) - .await? - .bytes() - .await - .map_err(|source| client::Error::GetResponseBody { - source, - path: location.to_string(), - })?; - Ok(bytes) - } - async fn head(&self, location: &Path) -> Result { use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; + let options = GetOptions::default(); // Extract meta from headers // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties - let response = self.client.get_request(location, None, true).await?; + let response = self.client.get_request(location, options, true).await?; let headers = response.headers(); let last_modified = headers @@ -1103,8 +1090,9 @@ fn split_sas(sas: &str) -> Result, Error> { mod tests { use super::*; use crate::tests::{ - copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, put_get_delete_list_opts, rename_and_copy, stream_get, + copy_if_not_exists, get_opts, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list, put_get_delete_list_opts, + rename_and_copy, stream_get, }; use std::collections::HashMap; use std::env; @@ -1175,6 +1163,7 @@ mod tests { async fn azure_blob_test() { let integration = maybe_skip_integration!().build().unwrap(); put_get_delete_list_opts(&integration, false).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -1203,6 +1192,7 @@ mod tests { let integration = builder.build().unwrap(); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/src/chunked.rs b/src/chunked.rs index aebefec..c639d7e 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -30,7 +30,7 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::util::maybe_spawn_blocking; -use crate::{GetResult, ListResult, ObjectMeta, ObjectStore}; +use crate::{GetOptions, GetResult, ListResult, ObjectMeta, ObjectStore}; use crate::{MultipartId, Result}; /// Wraps a [`ObjectStore`] and makes its get response return chunks @@ -81,8 +81,8 @@ impl ObjectStore for ChunkedStore { self.inner.abort_multipart(location, multipart_id).await } - async fn get(&self, location: 
&Path) -> Result { - match self.inner.get(location).await? { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + match self.inner.get_opts(location, options).await? { GetResult::File(std_file, ..) => { let reader = BufReader::new(std_file); let chunk_size = self.chunk_size; @@ -245,6 +245,7 @@ mod tests { let integration = ChunkedStore::new(Arc::clone(integration), 100); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/src/client/mod.rs b/src/client/mod.rs index ccf1b4a..be44a9f 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -31,11 +31,12 @@ use std::str::FromStr; use std::time::Duration; use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, ClientBuilder, Proxy}; +use reqwest::{Client, ClientBuilder, Proxy, RequestBuilder}; use serde::{Deserialize, Serialize}; use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; +use crate::GetOptions; fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { @@ -462,6 +463,40 @@ impl ClientOptions { } } +pub trait GetOptionsExt { + fn with_get_options(self, options: GetOptions) -> Self; +} + +impl GetOptionsExt for RequestBuilder { + fn with_get_options(mut self, options: GetOptions) -> Self { + use hyper::header::*; + + if let Some(range) = options.range { + let range = format!("bytes={}-{}", range.start, range.end.saturating_sub(1)); + self = self.header(RANGE, range); + } + + if let Some(tag) = options.if_match { + self = self.header(IF_MATCH, tag); + } + + if let Some(tag) = options.if_none_match { + self = self.header(IF_NONE_MATCH, tag); + } + + const DATE_FORMAT: &str = "%a, %d %b %Y %H:%M:%S GMT"; + if let Some(date) = options.if_unmodified_since { + self = self.header(IF_UNMODIFIED_SINCE, date.format(DATE_FORMAT).to_string()); + } + + if let Some(date) = options.if_modified_since { + self = self.header(IF_MODIFIED_SINCE, date.format(DATE_FORMAT).to_string()); + } + + self + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/client/retry.rs b/src/client/retry.rs index f9c2dd3..39a9131 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -32,6 +32,7 @@ pub struct Error { retries: usize, message: String, source: Option, + status: Option, } impl std::fmt::Display for Error { @@ -57,7 +58,28 @@ impl std::error::Error for Error { impl Error { /// Returns the status code associated with this error if any pub fn status(&self) -> Option { - self.source.as_ref().and_then(|e| e.status()) + self.status + } + + pub fn error(self, store: &'static str, path: String) -> crate::Error { + match self.status { + Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { + path, + source: Box::new(self), + }, + Some(StatusCode::NOT_MODIFIED) => crate::Error::NotModified { + path, + source: Box::new(self), + }, + Some(StatusCode::PRECONDITION_FAILED) => crate::Error::Precondition { + path, + source: Box::new(self), + }, + _ => crate::Error::Generic { + store, + source: Box::new(self), + }, + } } } @@ -146,6 +168,14 @@ impl RetryExt for reqwest::RequestBuilder { match s.send().await { Ok(r) => match r.error_for_status_ref() { Ok(_) if r.status().is_success() => return Ok(r), + Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { + return Err(Error{ + message: "not modified".to_string(), + retries, + status: Some(r.status()), + source: None, + }) + } Ok(r) => { let is_bare_redirect = 
r.status().is_redirection() && !r.headers().contains_key(LOCATION); let message = match is_bare_redirect { @@ -157,6 +187,7 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ message, retries, + status: Some(r.status()), source: None, }) } @@ -180,6 +211,7 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ message, retries, + status: Some(status), source: Some(e), }) @@ -209,7 +241,8 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ retries, message: "request error".to_string(), - source: Some(e) + status: e.status(), + source: Some(e), }) } let sleep = backoff.next(); diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 375b4d8..41a91fe 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -31,7 +31,6 @@ //! week. use std::collections::BTreeSet; use std::io; -use std::ops::Range; use std::str::FromStr; use std::sync::Arc; @@ -40,7 +39,6 @@ use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; -use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -49,14 +47,14 @@ use url::Url; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; -use crate::client::ClientConfigKey; +use crate::client::{ClientConfigKey, GetOptionsExt}; use crate::{ client::token::TokenCache, multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, - util::{format_http_range, format_prefix}, - ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, - RetryConfig, + util::format_prefix, + ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, Result, RetryConfig, }; use self::credential::{ @@ -66,6 +64,8 @@ use self::credential::{ mod credential; +const STORE: &str = "GCS"; + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] @@ -100,15 +100,12 @@ enum Error { path: String, }, - #[snafu(display("Error performing copy request {}: {}", path, source))] - CopyRequest { + #[snafu(display("Error performing put request {}: {}", path, source))] + PutRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing put request: {}", source))] - PutRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting put response body: {}", source))] PutResponseBody { source: reqwest::Error }, @@ -129,12 +126,6 @@ enum Error { #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, - #[snafu(display("Already exists: {}", path))] - AlreadyExists { - source: crate::client::retry::Error, - path: String, - }, - #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] UnableToParseUrl { source: url::ParseError, @@ -159,23 +150,12 @@ impl From for super::Error { match err { Error::GetRequest { source, path } | Error::DeleteRequest { source, path } - | Error::CopyRequest { source, path } - if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => - { - Self::NotFound { - path, - source: Box::new(source), - } - } - Error::AlreadyExists { source, path } => Self::AlreadyExists { - source: Box::new(source), - path, - }, + | Error::PutRequest { source, path } => source.error(STORE, path), Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: "GCS", key } + Self::UnknownConfigurationKey { store: STORE, key } } _ => Self::Generic { - store: "GCS", + store: STORE, source: Box::new(err), }, } @@ -280,26 +260,23 @@ impl GoogleCloudStorageClient { async fn get_request( &self, path: &Path, - range: Option>, + options: GetOptions, head: bool, ) -> Result { let token = self.get_token().await?; let url = self.object_url(path); - let mut builder = self.client.request(Method::GET, url); - - if let Some(range) = range { - builder = builder.header(RANGE, format_http_range(range)); - } - let alt = match head { true => "json", false => "media", }; + let builder = self.client.request(Method::GET, url); + let response = builder .bearer_auth(token) .query(&[("alt", alt)]) + .with_get_options(options) .send_retry(&self.retry_config) .await .context(GetRequestSnafu { @@ -331,7 +308,9 @@ impl GoogleCloudStorageClient { .body(payload) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)?; + .context(PutRequestSnafu { + path: path.as_ref(), + })?; Ok(()) } @@ -355,7 +334,9 @@ impl GoogleCloudStorageClient { .query(&[("uploads", "")]) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)?; + .context(PutRequestSnafu { + path: path.as_ref(), + })?; let data = response.bytes().await.context(PutResponseBodySnafu)?; let result: InitiateMultipartUploadResult = quick_xml::de::from_reader( @@ -387,7 +368,7 @@ impl GoogleCloudStorageClient { .query(&[("uploadId", multipart_id)]) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)?; + .context(PutRequestSnafu { path })?; Ok(()) } @@ -444,22 +425,12 @@ impl GoogleCloudStorageClient { .header(header::CONTENT_LENGTH, 0) .send_retry(&self.retry_config) .await - .map_err(|err| { - if err - .status() - .map(|status| status == reqwest::StatusCode::PRECONDITION_FAILED) - .unwrap_or_else(|| false) - { - Error::AlreadyExists { - source: err, - path: to.to_string(), - } - } else { - Error::CopyRequest { - source: err, - path: from.to_string(), - } - } + .map_err(|err| match err.status() { + Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { + source: Box::new(err), + path: to.to_string(), + }, + _ => err.error(STORE, from.to_string()), })?; Ok(()) @@ -667,12 +638,18 @@ impl ObjectStore for GoogleCloudStorage { Ok(()) } - async fn get(&self, location: &Path) -> Result { - let response = self.client.get_request(location, None, false).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + if options.if_modified_since.is_some() || options.if_unmodified_since.is_some() { + return Err(super::Error::NotSupported { + source: "ModifiedSince Preconditions not supported by GoogleCloudStorage JSON API".to_string().into(), + }); + } + + let response = self.client.get_request(location, options, false).await?; let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { - store: "GCS", + 
store: STORE, source: Box::new(source), }) .boxed(); @@ -680,18 +657,9 @@ impl ObjectStore for GoogleCloudStorage { Ok(GetResult::Stream(stream)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let response = self - .client - .get_request(location, Some(range), false) - .await?; - Ok(response.bytes().await.context(GetResponseBodySnafu { - path: location.as_ref(), - })?) - } - async fn head(&self, location: &Path) -> Result { - let response = self.client.get_request(location, None, true).await?; + let options = GetOptions::default(); + let response = self.client.get_request(location, options, true).await?; let object = response.json().await.context(GetResponseBodySnafu { path: location.as_ref(), })?; @@ -1224,13 +1192,7 @@ mod test { use std::io::Write; use tempfile::NamedTempFile; - use crate::{ - tests::{ - copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, - }, - Error as ObjectStoreError, ObjectStore, - }; + use crate::tests::*; use super::*; @@ -1299,6 +1261,8 @@ mod test { // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; + // Fake GCS server doesn't currently honor preconditions + get_opts(&integration).await; } } @@ -1311,7 +1275,7 @@ mod test { let err = integration.get(&location).await.unwrap_err(); assert!( - matches!(err, ObjectStoreError::NotFound { .. }), + matches!(err, crate::Error::NotFound { .. }), "unexpected error type: {err}" ); } @@ -1330,7 +1294,7 @@ mod test { .unwrap_err(); assert!( - matches!(err, ObjectStoreError::NotFound { .. }), + matches!(err, crate::Error::NotFound { .. }), "unexpected error type: {err}" ); } @@ -1343,7 +1307,7 @@ mod test { let err = integration.delete(&location).await.unwrap_err(); assert!( - matches!(err, ObjectStoreError::NotFound { .. }), + matches!(err, crate::Error::NotFound { .. }), "unexpected error type: {err}" ); } @@ -1359,7 +1323,7 @@ mod test { let err = integration.delete(&location).await.unwrap_err(); assert!( - matches!(err, ObjectStoreError::NotFound { .. }), + matches!(err, crate::Error::NotFound { .. }), "unexpected error type: {err}" ); } diff --git a/src/http/client.rs b/src/http/client.rs index 5ef2721..4e58eb0 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -16,17 +16,17 @@ // under the License. 
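// Illustrative sketch (not part of the diff): a rough example of how a caller
// might drive the `get_opts` / `GetOptions` API that this patch threads
// through the stores above. The store, path and contents are placeholders,
// `InMemory` is used only to keep it self-contained, and it assumes a Tokio runtime.
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{GetOptions, ObjectStore};

async fn conditional_fetch() -> object_store::Result<()> {
    let store = InMemory::new();
    let path = Path::from("data/file.bin");
    store.put(&path, "hello world".into()).await?;

    // Range request: the new default `get_range` builds a `GetOptions` with
    // `range` set and delegates to `get_opts` (stores may still override it).
    let prefix = store.get_range(&path, 0..5).await?;
    assert_eq!(prefix.as_ref(), b"hello");

    // Conditional request: only fetch if the object changed after a cached
    // timestamp, otherwise `get_opts` returns `Error::NotModified`.
    let cached = store.head(&path).await?.last_modified - chrono::Duration::hours(1);
    let options = GetOptions {
        if_modified_since: Some(cached),
        ..GetOptions::default()
    };
    let bytes = store.get_opts(&path, options).await?.bytes().await?;
    assert_eq!(bytes.len(), 11);
    Ok(())
}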
use crate::client::retry::{self, RetryConfig, RetryExt}; +use crate::client::GetOptionsExt; use crate::path::{Path, DELIMITER}; -use crate::util::{deserialize_rfc1123, format_http_range}; -use crate::{ClientOptions, ObjectMeta, Result}; +use crate::util::deserialize_rfc1123; +use crate::{ClientOptions, GetOptions, ObjectMeta, Result}; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::percent_decode_str; -use reqwest::header::{CONTENT_TYPE, RANGE}; +use reqwest::header::CONTENT_TYPE; use reqwest::{Method, Response, StatusCode}; use serde::Deserialize; use snafu::{OptionExt, ResultExt, Snafu}; -use std::ops::Range; use url::Url; #[derive(Debug, Snafu)] @@ -229,19 +229,12 @@ impl Client { Ok(()) } - pub async fn get( - &self, - location: &Path, - range: Option>, - ) -> Result { + pub async fn get(&self, location: &Path, options: GetOptions) -> Result { let url = self.path_url(location); - let mut builder = self.client.get(url); - - if let Some(range) = range { - builder = builder.header(RANGE, format_http_range(range)); - } + let builder = self.client.get(url); builder + .with_get_options(options) .send_retry(&self.retry_config) .await .map_err(|source| match source.status() { diff --git a/src/http/mod.rs b/src/http/mod.rs index c91faa2..bed1972 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -31,8 +31,6 @@ //! [rfc2518]: https://datatracker.ietf.org/doc/html/rfc2518 //! [WebDAV]: https://en.wikipedia.org/wiki/WebDAV -use std::ops::Range; - use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; @@ -45,8 +43,8 @@ use url::Url; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, - RetryConfig, + ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, Result, RetryConfig, }; mod client; @@ -119,8 +117,8 @@ impl ObjectStore for HttpStore { Err(super::Error::NotImplemented) } - async fn get(&self, location: &Path) -> Result { - let response = self.client.get(location, None).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let response = self.client.get(location, options).await?; let stream = response .bytes_stream() .map_err(|source| Error::Reqwest { source }.into()) @@ -129,17 +127,6 @@ impl ObjectStore for HttpStore { Ok(GetResult::Stream(stream)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let bytes = self - .client - .get(location, Some(range)) - .await? - .bytes() - .await - .context(ReqwestSnafu)?; - Ok(bytes) - } - async fn head(&self, location: &Path) -> Result { let status = self.client.list(Some(location), "0").await?; match status.response.len() { diff --git a/src/lib.rs b/src/lib.rs index 2c93802..75f9ca7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -346,11 +346,24 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } /// Return the bytes that are stored at the specified location. 
- async fn get(&self, location: &Path) -> Result; + async fn get(&self, location: &Path) -> Result { + self.get_opts(location, GetOptions::default()).await + } + + /// Perform a get request with options + /// + /// Note: options.range will be ignored if [`GetResult::File`] + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; /// Return the bytes that are stored at the specified location /// in the given byte range - async fn get_range(&self, location: &Path, range: Range) -> Result; + async fn get_range(&self, location: &Path, range: Range) -> Result { + let options = GetOptions { + range: Some(range), + ..Default::default() + }; + self.get_opts(location, options).await?.bytes().await + } /// Return the bytes that are stored at the specified location /// in the given byte ranges @@ -478,6 +491,10 @@ impl ObjectStore for Box { self.as_ref().get(location).await } + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + self.as_ref().get_opts(location, options).await + } + async fn get_range(&self, location: &Path, range: Range) -> Result { self.as_ref().get_range(location, range).await } @@ -558,6 +575,66 @@ pub struct ObjectMeta { pub e_tag: Option, } +/// Options for a get request, such as range +#[derive(Debug, Default)] +pub struct GetOptions { + /// Request will succeed if the `ObjectMeta::e_tag` matches + /// otherwise returning [`Error::Precondition`] + /// + /// + pub if_match: Option, + /// Request will succeed if the `ObjectMeta::e_tag` does not match + /// otherwise returning [`Error::NotModified`] + /// + /// + pub if_none_match: Option, + /// Request will succeed if the object has been modified since + /// + /// + pub if_modified_since: Option>, + /// Request will succeed if the object has not been modified since + /// otherwise returning [`Error::Precondition`] + /// + /// Some stores, such as S3, will only return `NotModified` for exact + /// timestamp matches, instead of for any timestamp greater than or equal. 
+ /// + /// + pub if_unmodified_since: Option>, + /// Request transfer of only the specified range of bytes + /// otherwise returning [`Error::NotModified`] + /// + /// + pub range: Option>, +} + +impl GetOptions { + /// Returns an error if the modification conditions on this request are not satisfied + fn check_modified( + &self, + location: &Path, + last_modified: DateTime, + ) -> Result<()> { + if let Some(date) = self.if_modified_since { + if last_modified <= date { + return Err(Error::NotModified { + path: location.to_string(), + source: format!("{} >= {}", date, last_modified).into(), + }); + } + } + + if let Some(date) = self.if_unmodified_since { + if last_modified > date { + return Err(Error::Precondition { + path: location.to_string(), + source: format!("{} < {}", date, last_modified).into(), + }); + } + } + Ok(()) + } +} + /// Result for a get request /// /// This special cases the case of a local file, as some systems may @@ -702,6 +779,18 @@ pub enum Error { source: Box, }, + #[snafu(display("Request precondition failure for path {}: {}", path, source))] + Precondition { + path: String, + source: Box, + }, + + #[snafu(display("Object at location {} not modified: {}", path, source))] + NotModified { + path: String, + source: Box, + }, + #[snafu(display("Operation not yet implemented."))] NotImplemented, @@ -1025,6 +1114,85 @@ mod tests { delete_fixtures(storage).await; } + pub(crate) async fn get_opts(storage: &dyn ObjectStore) { + let path = Path::from("test"); + storage.put(&path, "foo".into()).await.unwrap(); + let meta = storage.head(&path).await.unwrap(); + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified + chrono::Duration::hours(10)), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified - chrono::Duration::hours(10)), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Err(Error::Precondition { .. } | Error::NotSupported { .. }) => {} + d => panic!("{d:?}"), + } + + let options = GetOptions { + if_modified_since: Some(meta.last_modified), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Err(Error::NotModified { .. } | Error::NotSupported { .. }) => {} + d => panic!("{d:?}"), + } + + let options = GetOptions { + if_modified_since: Some(meta.last_modified - chrono::Duration::hours(10)), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + if let Some(tag) = meta.e_tag { + let options = GetOptions { + if_match: Some(tag.clone()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let options = GetOptions { + if_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + let options = GetOptions { + if_none_match: Some(tag.clone()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::NotModified { .. 
}), "{err}"); + + let options = GetOptions { + if_none_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + } + } + fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { std::iter::repeat(Bytes::from_iter(std::iter::repeat(b'x').take(chunk_length))) .take(num_chunks) diff --git a/src/limit.rs b/src/limit.rs index d0d9f73..e009111 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -18,8 +18,8 @@ //! An object store that limits the maximum concurrency of the wrapped implementation use crate::{ - BoxStream, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, - StreamExt, + BoxStream, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, + Path, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -114,6 +114,16 @@ impl ObjectStore for LimitStore { } } + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + match self.inner.get_opts(location, options).await? { + r @ GetResult::File(_, _) => Ok(r), + GetResult::Stream(s) => { + Ok(GetResult::Stream(PermitWrapper::new(s, permit).boxed())) + } + } + } + async fn get_range(&self, location: &Path, range: Range) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.get_range(location, range).await @@ -251,10 +261,7 @@ impl AsyncWrite for PermitWrapper { mod tests { use crate::limit::LimitStore; use crate::memory::InMemory; - use crate::tests::{ - list_uses_directories_correctly, list_with_delimiter, put_get_delete_list, - rename_and_copy, stream_get, - }; + use crate::tests::*; use crate::ObjectStore; use std::time::Duration; use tokio::time::timeout; @@ -266,6 +273,7 @@ mod tests { let integration = LimitStore::new(memory, max_requests); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/src/local.rs b/src/local.rs index b40f5a7..26a8bf3 100644 --- a/src/local.rs +++ b/src/local.rs @@ -19,7 +19,7 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -56,7 +56,7 @@ pub(crate) enum Error { }, #[snafu(display("Unable to access metadata for {}: {}", path, source))] - UnableToAccessMetadata { + Metadata { source: Box, path: String, }, @@ -360,10 +360,27 @@ impl ObjectStore for LocalFileSystem { Err(super::Error::NotImplemented) } - async fn get(&self, location: &Path) -> Result { - let path = self.config.path_to_filesystem(location)?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + if options.if_match.is_some() || options.if_none_match.is_some() { + return Err(super::Error::NotSupported { + source: "ETags not supported by LocalFileSystem".to_string().into(), + }); + } + + let location = location.clone(); + let path = self.config.path_to_filesystem(&location)?; maybe_spawn_blocking(move || { let file = open_file(&path)?; + if options.if_unmodified_since.is_some() + || options.if_modified_since.is_some() + { + let metadata = file.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: location.to_string(), + })?; + options.check_modified(&location, 
last_modified(&metadata))?; + } + Ok(GetResult::File(file, path)) }) .await @@ -408,7 +425,7 @@ impl ObjectStore for LocalFileSystem { source: e, } } else { - Error::UnableToAccessMetadata { + Error::Metadata { source: e.into(), path: location.to_string(), } @@ -878,21 +895,22 @@ fn open_file(path: &PathBuf) -> Result { } fn convert_entry(entry: DirEntry, location: Path) -> Result { - let metadata = entry - .metadata() - .map_err(|e| Error::UnableToAccessMetadata { - source: e.into(), - path: location.to_string(), - })?; + let metadata = entry.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: location.to_string(), + })?; convert_metadata(metadata, location) } -fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { - let last_modified: DateTime = metadata +fn last_modified(metadata: &std::fs::Metadata) -> DateTime { + metadata .modified() .expect("Modified file time should be supported on this platform") - .into(); + .into() +} +fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { + let last_modified = last_modified(&metadata); let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { path: location.as_ref(), })?; @@ -956,13 +974,7 @@ fn convert_walkdir_result( mod tests { use super::*; use crate::test_util::flatten_list_stream; - use crate::{ - tests::{ - copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, - }, - Error as ObjectStoreError, ObjectStore, - }; + use crate::tests::*; use futures::TryStreamExt; use tempfile::{NamedTempFile, TempDir}; use tokio::io::AsyncWriteExt; @@ -973,6 +985,7 @@ mod tests { let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -1085,7 +1098,7 @@ mod tests { let err = get_nonexistent_object(&integration, Some(location)) .await .unwrap_err(); - if let ObjectStoreError::NotFound { path, source } = err { + if let crate::Error::NotFound { path, source } = err { let source_variant = source.downcast_ref::(); assert!( matches!(source_variant, Some(std::io::Error { .. }),), diff --git a/src/memory.rs b/src/memory.rs index b01ffbb..82d4859 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -16,8 +16,8 @@ // under the License. //! 
An in-memory object store implementation -use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; @@ -128,12 +128,17 @@ impl ObjectStore for InMemory { })) } - async fn get(&self, location: &Path) -> Result { - let data = self.entry(location).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + if options.if_match.is_some() || options.if_none_match.is_some() { + return Err(super::Error::NotSupported { + source: "ETags not supported by InMemory".to_string().into(), + }); + } + let (data, last_modified) = self.entry(location).await?; + options.check_modified(location, last_modified)?; - Ok(GetResult::Stream( - futures::stream::once(async move { Ok(data.0) }).boxed(), - )) + let stream = futures::stream::once(futures::future::ready(Ok(data))); + Ok(GetResult::Stream(stream.boxed())) } async fn get_range(&self, location: &Path, range: Range) -> Result { @@ -391,19 +396,14 @@ mod tests { use super::*; - use crate::{ - tests::{ - copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, - }, - Error as ObjectStoreError, ObjectStore, - }; + use crate::tests::*; #[tokio::test] async fn in_memory_test() { let integration = InMemory::new(); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -443,7 +443,7 @@ mod tests { let err = get_nonexistent_object(&integration, Some(location)) .await .unwrap_err(); - if let ObjectStoreError::NotFound { path, source } = err { + if let crate::Error::NotFound { path, source } = err { let source_variant = source.downcast_ref::(); assert!( matches!(source_variant, Some(Error::NoDataInMemory { .. 
}),), diff --git a/src/prefix.rs b/src/prefix.rs index 94836d3..ffe5094 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -22,7 +22,9 @@ use std::ops::Range; use tokio::io::AsyncWrite; use crate::path::Path; -use crate::{GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result}; +use crate::{ + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, +}; #[doc(hidden)] #[deprecated(note = "Use PrefixStore")] @@ -117,6 +119,15 @@ impl ObjectStore for PrefixStore { self.inner.get_range(&full_path, range).await } + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> Result { + let full_path = self.full_path(location); + self.inner.get_opts(&full_path, options).await + } + async fn get_ranges( &self, location: &Path, @@ -206,10 +217,7 @@ mod tests { use super::*; use crate::local::LocalFileSystem; use crate::test_util::flatten_list_stream; - use crate::tests::{ - copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, - }; + use crate::tests::*; use tempfile::TempDir; @@ -220,6 +228,7 @@ mod tests { let integration = PrefixStore::new(inner, "prefix"); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/src/throttle.rs b/src/throttle.rs index e513031..fb90afc 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -20,8 +20,8 @@ use parking_lot::Mutex; use std::ops::Range; use std::{convert::TryInto, sync::Arc}; -use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; use futures::{stream::BoxStream, FutureExt, StreamExt}; @@ -179,17 +179,18 @@ impl ObjectStore for ThrottledStore { // need to copy to avoid moving / referencing `self` let wait_get_per_byte = self.config().wait_get_per_byte; - self.inner.get(location).await.map(|result| { - let s = match result { - GetResult::Stream(s) => s, - GetResult::File(_, _) => unimplemented!(), - }; + let result = self.inner.get(location).await?; + Ok(throttle_get(result, wait_get_per_byte)) + } - GetResult::Stream(throttle_stream(s, move |bytes| { - let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); - wait_get_per_byte * bytes_len - })) - }) + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + sleep(self.config().wait_get_per_call).await; + + // need to copy to avoid moving / referencing `self` + let wait_get_per_byte = self.config().wait_get_per_byte; + + let result = self.inner.get_opts(location, options).await?; + Ok(throttle_get(result, wait_get_per_byte)) } async fn get_range(&self, location: &Path, range: Range) -> Result { @@ -299,6 +300,18 @@ fn usize_to_u32_saturate(x: usize) -> u32 { x.try_into().unwrap_or(u32::MAX) } +fn throttle_get(result: GetResult, wait_get_per_byte: Duration) -> GetResult { + let s = match result { + GetResult::Stream(s) => s, + GetResult::File(_, _) => unimplemented!(), + }; + + GetResult::Stream(throttle_stream(s, move |bytes| { + let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); + wait_get_per_byte * bytes_len + })) +} + fn throttle_stream( stream: BoxStream<'_, Result>, delay: F, @@ -317,13 +330,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::{ - memory::InMemory, - tests::{ - copy_if_not_exists, list_uses_directories_correctly, 
list_with_delimiter, - put_get_delete_list, rename_and_copy, - }, - }; + use crate::{memory::InMemory, tests::*}; use bytes::Bytes; use futures::TryStreamExt; use tokio::time::Duration; diff --git a/src/util.rs b/src/util.rs index e5c701d..ba4c683 100644 --- a/src/util.rs +++ b/src/util.rs @@ -44,13 +44,6 @@ pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)) } -/// Returns a formatted HTTP range header as per -/// -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] -pub fn format_http_range(range: std::ops::Range) -> String { - format!("bytes={}-{}", range.start, range.end.saturating_sub(1)) -} - #[cfg(any(feature = "aws", feature = "azure"))] pub(crate) fn hmac_sha256( secret: impl AsRef<[u8]>, From b8747233b72ea550af18f98bc9fbc7b4ebf50441 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 15 May 2023 18:56:21 +0100 Subject: [PATCH 141/397] Consistently use GCP XML API (#4207) * Consistently use GCP XML API * Use updated fake-gcs-server * Review feedback --- CONTRIBUTING.md | 2 +- Cargo.toml | 2 +- src/aws/client.rs | 69 +------------------ src/aws/mod.rs | 57 ++------------- src/azure/mod.rs | 60 ++-------------- src/client/header.rs | 83 ++++++++++++++++++++++ src/client/list.rs | 85 +++++++++++++++++++++++ src/client/mod.rs | 6 ++ src/gcp/mod.rs | 161 ++++++++++++++----------------------------- src/prefix.rs | 6 +- 10 files changed, 245 insertions(+), 286 deletions(-) create mode 100644 src/client/header.rs create mode 100644 src/client/list.rs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 550640d..47c2940 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -103,7 +103,7 @@ To test the GCS integration, we use [Fake GCS Server](https://github.com/fsouza/ Startup the fake server: ```shell -docker run -p 4443:4443 fsouza/fake-gcs-server -scheme http +docker run -p 4443:4443 tustvold/fake-gcs-server -scheme http ``` Configure the account: diff --git a/Cargo.toml b/Cargo.toml index e25801b..c6b89fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,7 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut nix = "0.26.1" [features] -cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json","reqwest/stream", "chrono/serde", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] diff --git a/src/aws/client.rs b/src/aws/client.rs index b2d01ab..1cdf785 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -18,6 +18,7 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; use crate::aws::{STORE, STRICT_PATH_ENCODE_SET}; +use crate::client::list::ListResponse; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; @@ -25,13 +26,12 @@ use crate::multipart::UploadPart; use crate::path::DELIMITER; use crate::util::format_prefix; use crate::{ - BoxStream, ClientOptions, GetOptions, ListResult, MultipartId, ObjectMeta, Path, - Result, RetryConfig, StreamExt, + BoxStream, ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, + RetryConfig, StreamExt, }; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; -use 
chrono::{DateTime, Utc}; use percent_encoding::{utf8_percent_encode, PercentEncode}; use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, Response}; use serde::{Deserialize, Serialize}; @@ -109,69 +109,6 @@ impl From for crate::Error { } } -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListResponse { - #[serde(default)] - pub contents: Vec, - #[serde(default)] - pub common_prefixes: Vec, - #[serde(default)] - pub next_continuation_token: Option, -} - -impl TryFrom for ListResult { - type Error = crate::Error; - - fn try_from(value: ListResponse) -> Result { - let common_prefixes = value - .common_prefixes - .into_iter() - .map(|x| Ok(Path::parse(x.prefix)?)) - .collect::>()?; - - let objects = value - .contents - .into_iter() - .map(TryFrom::try_from) - .collect::>()?; - - Ok(Self { - common_prefixes, - objects, - }) - } -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListPrefix { - pub prefix: String, -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListContents { - pub key: String, - pub size: usize, - pub last_modified: DateTime, - #[serde(rename = "ETag")] - pub e_tag: Option, -} - -impl TryFrom for ObjectMeta { - type Error = crate::Error; - - fn try_from(value: ListContents) -> Result { - Ok(Self { - location: Path::parse(value.key)?, - last_modified: value.last_modified, - size: value.size, - e_tag: value.e_tag, - }) - } -} - #[derive(Debug, Deserialize)] #[serde(rename_all = "PascalCase")] struct InitiateMultipart { diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 3f9b480..2c38a9b 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -33,7 +33,6 @@ use async_trait::async_trait; use bytes::Bytes; -use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; use itertools::Itertools; @@ -52,6 +51,7 @@ use crate::aws::credential::{ AwsCredential, CredentialProvider, InstanceCredentialProvider, StaticCredentialProvider, WebIdentityProvider, }; +use crate::client::header::header_meta; use crate::client::ClientConfigKey; use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; @@ -87,24 +87,6 @@ static METADATA_ENDPOINT: &str = "http://169.254.169.254"; #[derive(Debug, Snafu)] #[allow(missing_docs)] enum Error { - #[snafu(display("Last-Modified Header missing from response"))] - MissingLastModified, - - #[snafu(display("Content-Length Header missing from response"))] - MissingContentLength, - - #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] - InvalidLastModified { - last_modified: String, - source: chrono::ParseError, - }, - - #[snafu(display("Invalid content length '{}': {}", content_length, source))] - InvalidContentLength { - content_length: String, - source: std::num::ParseIntError, - }, - #[snafu(display("Missing region"))] MissingRegion, @@ -155,6 +137,11 @@ enum Error { #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] RegionParse { bucket: String }, + + #[snafu(display("Failed to parse headers: {}", source))] + Header { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -261,41 +248,11 @@ impl ObjectStore for AmazonS3 { } async fn head(&self, location: &Path) -> Result { - use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; - let options = GetOptions::default(); // Extract meta from headers // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax 
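// Illustrative sketch (not part of the diff): after this refactor the S3, GCS
// and Azure `head` implementations all derive `ObjectMeta` from the same three
// response headers via the shared `header_meta` helper added later in this
// patch (src/client/header.rs). From a caller's perspective the result looks
// roughly like this; the store, path and size are placeholders and a Tokio
// runtime is assumed.
use object_store::{memory::InMemory, path::Path, ObjectStore};

async fn inspect_object() -> object_store::Result<()> {
    let store = InMemory::new();
    let path = Path::from("dir/data.bin");
    store.put(&path, vec![0u8; 42].into()).await?;

    // `head` returns the location, size, last-modified time and (where the
    // backend provides one) the ETag of the object.
    let meta = store.head(&path).await?;
    println!(
        "{}: {} bytes, last modified {}, etag {:?}",
        meta.location, meta.size, meta.last_modified, meta.e_tag
    );
    Ok(())
}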
let response = self.client.get_request(location, options, true).await?; - let headers = response.headers(); - - let last_modified = headers - .get(LAST_MODIFIED) - .context(MissingLastModifiedSnafu)?; - - let content_length = headers - .get(CONTENT_LENGTH) - .context(MissingContentLengthSnafu)?; - - let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; - let last_modified = DateTime::parse_from_rfc2822(last_modified) - .context(InvalidLastModifiedSnafu { last_modified })? - .with_timezone(&Utc); - - let content_length = content_length.to_str().context(BadHeaderSnafu)?; - let content_length = content_length - .parse() - .context(InvalidContentLengthSnafu { content_length })?; - - let e_tag = headers.get(ETAG).context(MissingEtagSnafu)?; - let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; - - Ok(ObjectMeta { - location: location.clone(), - last_modified, - size: content_length, - e_tag: Some(e_tag.to_string()), - }) + Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) } async fn delete(&self, location: &Path) -> Result<()> { diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 6726241..0f8dae0 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -38,7 +38,6 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; -use chrono::{TimeZone, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; @@ -50,9 +49,9 @@ use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; +use crate::client::header::header_meta; use crate::client::ClientConfigKey; use crate::config::ConfigValue; -use crate::util::RFC1123_FMT; pub use credential::authority_hosts; mod client; @@ -75,24 +74,6 @@ const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; #[derive(Debug, Snafu)] #[allow(missing_docs)] enum Error { - #[snafu(display("Last-Modified Header missing from response"))] - MissingLastModified, - - #[snafu(display("Content-Length Header missing from response"))] - MissingContentLength, - - #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] - InvalidLastModified { - last_modified: String, - source: chrono::ParseError, - }, - - #[snafu(display("Invalid content length '{}': {}", content_length, source))] - InvalidContentLength { - content_length: String, - source: std::num::ParseIntError, - }, - #[snafu(display("Received header containing non-ASCII data"))] BadHeader { source: reqwest::header::ToStrError }, @@ -146,6 +127,11 @@ enum Error { #[snafu(display("ETag Header missing from response"))] MissingEtag, + + #[snafu(display("Failed to parse headers: {}", source))] + Header { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -223,44 +209,12 @@ impl ObjectStore for MicrosoftAzure { } async fn head(&self, location: &Path) -> Result { - use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; let options = GetOptions::default(); // Extract meta from headers // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties let response = self.client.get_request(location, options, true).await?; - let headers = response.headers(); - - let last_modified = headers - .get(LAST_MODIFIED) - .ok_or(Error::MissingLastModified)? 
- .to_str() - .context(BadHeaderSnafu)?; - let last_modified = Utc - .datetime_from_str(last_modified, RFC1123_FMT) - .context(InvalidLastModifiedSnafu { last_modified })?; - - let content_length = headers - .get(CONTENT_LENGTH) - .ok_or(Error::MissingContentLength)? - .to_str() - .context(BadHeaderSnafu)?; - let content_length = content_length - .parse() - .context(InvalidContentLengthSnafu { content_length })?; - - let e_tag = headers - .get(ETAG) - .ok_or(Error::MissingEtag)? - .to_str() - .context(BadHeaderSnafu)?; - - Ok(ObjectMeta { - location: location.clone(), - last_modified, - size: content_length, - e_tag: Some(e_tag.to_string()), - }) + Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) } async fn delete(&self, location: &Path) -> Result<()> { diff --git a/src/client/header.rs b/src/client/header.rs new file mode 100644 index 0000000..cc4f16e --- /dev/null +++ b/src/client/header.rs @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Logic for extracting ObjectMeta from headers used by AWS, GCP and Azure + +use crate::path::Path; +use crate::ObjectMeta; +use chrono::{DateTime, Utc}; +use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; +use hyper::HeaderMap; +use snafu::{OptionExt, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("ETag Header missing from response"))] + MissingEtag, + + #[snafu(display("Received header containing non-ASCII data"))] + BadHeader { source: reqwest::header::ToStrError }, + + #[snafu(display("Last-Modified Header missing from response"))] + MissingLastModified, + + #[snafu(display("Content-Length Header missing from response"))] + MissingContentLength, + + #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] + InvalidLastModified { + last_modified: String, + source: chrono::ParseError, + }, + + #[snafu(display("Invalid content length '{}': {}", content_length, source))] + InvalidContentLength { + content_length: String, + source: std::num::ParseIntError, + }, +} + +/// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] +pub fn header_meta(location: &Path, headers: &HeaderMap) -> Result { + let last_modified = headers + .get(LAST_MODIFIED) + .context(MissingLastModifiedSnafu)?; + + let content_length = headers + .get(CONTENT_LENGTH) + .context(MissingContentLengthSnafu)?; + + let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; + let last_modified = DateTime::parse_from_rfc2822(last_modified) + .context(InvalidLastModifiedSnafu { last_modified })? 
+ .with_timezone(&Utc); + + let content_length = content_length.to_str().context(BadHeaderSnafu)?; + let content_length = content_length + .parse() + .context(InvalidContentLengthSnafu { content_length })?; + + let e_tag = headers.get(ETAG).context(MissingEtagSnafu)?; + let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; + + Ok(ObjectMeta { + location: location.clone(), + last_modified, + size: content_length, + e_tag: Some(e_tag.to_string()), + }) +} diff --git a/src/client/list.rs b/src/client/list.rs new file mode 100644 index 0000000..6a3889e --- /dev/null +++ b/src/client/list.rs @@ -0,0 +1,85 @@ +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The list response format used by GCP and AWS + +use crate::path::Path; +use crate::{ListResult, ObjectMeta, Result}; +use chrono::{DateTime, Utc}; +use serde::Deserialize; + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListResponse { + #[serde(default)] + pub contents: Vec, + #[serde(default)] + pub common_prefixes: Vec, + #[serde(default)] + pub next_continuation_token: Option, +} + +impl TryFrom for ListResult { + type Error = crate::Error; + + fn try_from(value: ListResponse) -> Result { + let common_prefixes = value + .common_prefixes + .into_iter() + .map(|x| Ok(Path::parse(x.prefix)?)) + .collect::>()?; + + let objects = value + .contents + .into_iter() + .map(TryFrom::try_from) + .collect::>()?; + + Ok(Self { + common_prefixes, + objects, + }) + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListPrefix { + pub prefix: String, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListContents { + pub key: String, + pub size: usize, + pub last_modified: DateTime, + #[serde(rename = "ETag")] + pub e_tag: Option, +} + +impl TryFrom for ObjectMeta { + type Error = crate::Error; + + fn try_from(value: ListContents) -> Result { + Ok(Self { + location: Path::parse(value.key)?, + last_modified: value.last_modified, + size: value.size, + e_tag: value.e_tag, + }) + } +} diff --git a/src/client/mod.rs b/src/client/mod.rs index be44a9f..c6a73fe 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -26,6 +26,12 @@ pub mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub mod header; + +#[cfg(any(feature = "aws", feature = "gcp"))] +pub mod list; + use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 41a91fe..32f4055 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -36,15 +36,16 @@ use std::sync::Arc; use async_trait::async_trait; use bytes::{Buf, Bytes}; -use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; -use 
percent_encoding::{percent_encode, NON_ALPHANUMERIC}; +use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::{header, Client, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; +use crate::client::header::header_meta; +use crate::client::list::ListResponse; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::client::{ClientConfigKey, GetOptionsExt}; @@ -82,6 +83,9 @@ enum Error { #[snafu(display("Error getting list response body: {}", source))] ListResponseBody { source: reqwest::Error }, + #[snafu(display("Got invalid list response: {}", source))] + InvalidListResponse { source: quick_xml::de::DeError }, + #[snafu(display("Error performing get request {}: {}", path, source))] GetRequest { source: crate::client::retry::Error, @@ -143,6 +147,11 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("Failed to parse headers: {}", source))] + Header { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -162,25 +171,6 @@ impl From for super::Error { } } -#[derive(serde::Deserialize, Debug)] -#[serde(rename_all = "camelCase")] -struct ListResponse { - next_page_token: Option, - #[serde(default)] - prefixes: Vec, - #[serde(default)] - items: Vec, -} - -#[derive(serde::Deserialize, Debug)] -struct Object { - name: String, - size: String, - updated: DateTime, - #[serde(rename = "etag")] - e_tag: Option, -} - #[derive(serde::Deserialize, Debug)] #[serde(rename_all = "PascalCase")] struct InitiateMultipartUploadResult { @@ -248,15 +238,11 @@ impl GoogleCloudStorageClient { } fn object_url(&self, path: &Path) -> String { - let encoded = - percent_encoding::utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); - format!( - "{}/storage/v1/b/{}/o/{}", - self.base_url, self.bucket_name_encoded, encoded - ) + let encoded = utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); + format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, encoded) } - /// Perform a get request + /// Perform a get request async fn get_request( &self, path: &Path, @@ -266,16 +252,15 @@ impl GoogleCloudStorageClient { let token = self.get_token().await?; let url = self.object_url(path); - let alt = match head { - true => "json", - false => "media", + let method = match head { + true => Method::HEAD, + false => Method::GET, }; - let builder = self.client.request(Method::GET, url); - - let response = builder + let response = self + .client + .request(method, url) .bearer_auth(token) - .query(&[("alt", alt)]) .with_get_options(options) .send_retry(&self.retry_config) .await @@ -286,13 +271,10 @@ impl GoogleCloudStorageClient { Ok(response) } - /// Perform a put request + /// Perform a put request async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { let token = self.get_token().await?; - let url = format!( - "{}/upload/storage/v1/b/{}/o", - self.base_url, self.bucket_name_encoded - ); + let url = self.object_url(path); let content_type = self .client_options @@ -300,11 +282,10 @@ impl GoogleCloudStorageClient { .unwrap_or("application/octet-stream"); self.client - .request(Method::POST, url) + .request(Method::PUT, url) .bearer_auth(token) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) - .query(&[("uploadType", "media"), ("name", path.as_ref())]) .body(payload) 
.send_retry(&self.retry_config) .await @@ -373,7 +354,7 @@ impl GoogleCloudStorageClient { Ok(()) } - /// Perform a delete request + /// Perform a delete request async fn delete_request(&self, path: &Path) -> Result<()> { let token = self.get_token().await?; let url = self.object_url(path); @@ -390,7 +371,7 @@ impl GoogleCloudStorageClient { Ok(()) } - /// Perform a copy request + /// Perform a copy request async fn copy_request( &self, from: &Path, @@ -398,24 +379,18 @@ impl GoogleCloudStorageClient { if_not_exists: bool, ) -> Result<()> { let token = self.get_token().await?; + let url = self.object_url(to); - let source = - percent_encoding::utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); - let destination = - percent_encoding::utf8_percent_encode(to.as_ref(), NON_ALPHANUMERIC); - let url = format!( - "{}/storage/v1/b/{}/o/{}/copyTo/b/{}/o/{}", - self.base_url, - self.bucket_name_encoded, - source, - self.bucket_name_encoded, - destination - ); + let from = utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); + let source = format!("{}/{}", self.bucket_name_encoded, from); - let mut builder = self.client.request(Method::POST, url); + let mut builder = self + .client + .request(Method::PUT, url) + .header("x-goog-copy-source", source); if if_not_exists { - builder = builder.query(&[("ifGenerationMatch", "0")]); + builder = builder.header("x-goog-if-generation-match", 0); } builder @@ -436,7 +411,7 @@ impl GoogleCloudStorageClient { Ok(()) } - /// Perform a list request + /// Perform a list request async fn list_request( &self, prefix: Option<&str>, @@ -444,13 +419,10 @@ impl GoogleCloudStorageClient { page_token: Option<&str>, ) -> Result { let token = self.get_token().await?; + let url = format!("{}/{}", self.base_url, self.bucket_name_encoded); - let url = format!( - "{}/storage/v1/b/{}/o", - self.base_url, self.bucket_name_encoded - ); - - let mut query = Vec::with_capacity(4); + let mut query = Vec::with_capacity(5); + query.push(("list-type", "2")); if delimiter { query.push(("delimiter", DELIMITER)) } @@ -460,14 +432,14 @@ impl GoogleCloudStorageClient { } if let Some(page_token) = page_token { - query.push(("pageToken", page_token)) + query.push(("continuation-token", page_token)) } if let Some(max_results) = &self.max_list_results { - query.push(("maxResults", max_results)) + query.push(("max-keys", max_results)) } - let response: ListResponse = self + let response = self .client .request(Method::GET, url) .query(&query) @@ -475,10 +447,13 @@ impl GoogleCloudStorageClient { .send_retry(&self.retry_config) .await .context(ListRequestSnafu)? 
- .json() + .bytes() .await .context(ListResponseBodySnafu)?; + let response: ListResponse = quick_xml::de::from_reader(response.reader()) + .context(InvalidListResponseSnafu)?; + Ok(response) } @@ -487,14 +462,14 @@ impl GoogleCloudStorageClient { &self, prefix: Option<&Path>, delimiter: bool, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'_, Result> { let prefix = format_prefix(prefix); stream_paginated(prefix, move |prefix, token| async move { let mut r = self .list_request(prefix.as_deref(), delimiter, token.as_deref()) .await?; - let next_token = r.next_page_token.take(); - Ok((r, prefix, next_token)) + let next_token = r.next_continuation_token.take(); + Ok((r.try_into()?, prefix, next_token)) }) .boxed() } @@ -639,12 +614,6 @@ impl ObjectStore for GoogleCloudStorage { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - if options.if_modified_since.is_some() || options.if_unmodified_since.is_some() { - return Err(super::Error::NotSupported { - source: "ModifiedSince Preconditions not supported by GoogleCloudStorage JSON API".to_string().into(), - }); - } - let response = self.client.get_request(location, options, false).await?; let stream = response .bytes_stream() @@ -660,10 +629,7 @@ impl ObjectStore for GoogleCloudStorage { async fn head(&self, location: &Path) -> Result { let options = GetOptions::default(); let response = self.client.get_request(location, options, true).await?; - let object = response.json().await.context(GetResponseBodySnafu { - path: location.as_ref(), - })?; - convert_object_meta(&object) + Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) } async fn delete(&self, location: &Path) -> Result<()> { @@ -677,11 +643,7 @@ impl ObjectStore for GoogleCloudStorage { let stream = self .client .list_paginated(prefix, false) - .map_ok(|r| { - futures::stream::iter( - r.items.into_iter().map(|x| convert_object_meta(&x)), - ) - }) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() .boxed(); @@ -696,15 +658,8 @@ impl ObjectStore for GoogleCloudStorage { while let Some(result) = stream.next().await { let response = result?; - - for p in response.prefixes { - common_prefixes.insert(Path::parse(p)?); - } - - objects.reserve(response.items.len()); - for object in &response.items { - objects.push(convert_object_meta(object)?); - } + common_prefixes.extend(response.common_prefixes.into_iter()); + objects.extend(response.objects.into_iter()); } Ok(ListResult { @@ -1170,20 +1125,6 @@ impl GoogleCloudStorageBuilder { } } -fn convert_object_meta(object: &Object) -> Result { - let location = Path::parse(&object.name)?; - let last_modified = object.updated; - let size = object.size.parse().context(InvalidSizeSnafu)?; - let e_tag = object.e_tag.clone(); - - Ok(ObjectMeta { - location, - last_modified, - size, - e_tag, - }) -} - #[cfg(test)] mod test { use bytes::Bytes; diff --git a/src/prefix.rs b/src/prefix.rs index ffe5094..39585f7 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -119,11 +119,7 @@ impl ObjectStore for PrefixStore { self.inner.get_range(&full_path, range).await } - async fn get_opts( - &self, - location: &Path, - options: GetOptions, - ) -> Result { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let full_path = self.full_path(location); self.inner.get_opts(&full_path, options).await } From b2ab0d93c472a3728fa63c6cef0aee24aff34a65 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Tue, 16 May 2023 04:48:12 -0700 Subject: [PATCH 142/397] Object Store (AWS): Support 
region configured via named profile (#4161) * feat(aws_profile): use profile region as fallback * moved ProfileProvider to aws::profile module * added aws::region::RegionProvider * lazy-init profile credential provider * support overriding profile region * tests * fix(aws_profile): clippy & RAT errors * fix(aws_profile): make RegionProvider async * test(aws_profile): use fake config for testing * refactor(aws_profile): remove unnecessary module aws::profile::region -> aws::profile * refactor(aws_profile): tests w/ profile files * fix(object_store): rat + clippy warnings * Don't spawn thread --------- Co-authored-by: Raphael Taylor-Davies --- src/aws/credential.rs | 62 -------------------- src/aws/mod.rs | 78 ++++++++++++++++++++++++- src/aws/profile.rs | 128 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 64 deletions(-) create mode 100644 src/aws/profile.rs diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 16cdf35..9e04794 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -515,68 +515,6 @@ async fn web_identity( }) } -#[cfg(feature = "aws_profile")] -mod profile { - use super::*; - use aws_config::profile::ProfileFileCredentialsProvider; - use aws_config::provider_config::ProviderConfig; - use aws_credential_types::provider::ProvideCredentials; - use aws_types::region::Region; - use std::time::SystemTime; - - #[derive(Debug)] - pub struct ProfileProvider { - cache: TokenCache>, - credentials: ProfileFileCredentialsProvider, - } - - impl ProfileProvider { - pub fn new(name: String, region: String) -> Self { - let config = ProviderConfig::default().with_region(Some(Region::new(region))); - - Self { - cache: Default::default(), - credentials: ProfileFileCredentialsProvider::builder() - .configure(&config) - .profile_name(name) - .build(), - } - } - } - - impl CredentialProvider for ProfileProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(self.cache.get_or_insert_with(move || async move { - let c = - self.credentials - .provide_credentials() - .await - .map_err(|source| crate::Error::Generic { - store: STORE, - source: Box::new(source), - })?; - let t_now = SystemTime::now(); - let expiry = c - .expiry() - .and_then(|e| e.duration_since(t_now).ok()) - .map(|ttl| Instant::now() + ttl); - - Ok(TemporaryToken { - token: Arc::new(AwsCredential { - key_id: c.access_key_id().to_string(), - secret_key: c.secret_access_key().to_string(), - token: c.session_token().map(ToString::to_string), - }), - expiry, - }) - })) - } - } -} - -#[cfg(feature = "aws_profile")] -pub use profile::ProfileProvider; - #[cfg(test)] mod tests { use super::*; diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 2c38a9b..428e013 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -64,6 +64,9 @@ mod checksum; mod client; mod credential; +#[cfg(feature = "aws_profile")] +mod profile; + // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html // // Do not URI-encode any of the unreserved characters that RFC 3986 defines: @@ -985,8 +988,14 @@ impl AmazonS3Builder { self.parse_url(&url)?; } + let region = match (self.region.clone(), self.profile.clone()) { + (Some(region), _) => Some(region), + (None, Some(profile)) => profile_region(profile), + (None, None) => None, + }; + let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; - let region = self.region.context(MissingRegionSnafu)?; + let region = region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let 
credentials = match (self.access_key_id, self.secret_access_key, self.token) { @@ -1094,12 +1103,30 @@ impl AmazonS3Builder { } } +#[cfg(feature = "aws_profile")] +fn profile_region(profile: String) -> Option { + use tokio::runtime::Handle; + + let handle = Handle::current(); + let provider = profile::ProfileProvider::new(profile, None); + + handle.block_on(provider.get_region()) +} + #[cfg(feature = "aws_profile")] fn profile_credentials( profile: String, region: String, ) -> Result> { - Ok(Box::new(credential::ProfileProvider::new(profile, region))) + Ok(Box::new(profile::ProfileProvider::new( + profile, + Some(region), + ))) +} + +#[cfg(not(feature = "aws_profile"))] +fn profile_region(_profile: String) -> Option { + None } #[cfg(not(feature = "aws_profile"))] @@ -1594,3 +1621,50 @@ mod s3_resolve_bucket_region_tests { assert!(result.is_err()); } } + +#[cfg(all(test, feature = "aws_profile"))] +mod profile_tests { + use super::*; + use std::env; + + use super::profile::{TEST_PROFILE_NAME, TEST_PROFILE_REGION}; + + #[tokio::test] + async fn s3_test_region_from_profile() { + let s3_url = "s3://bucket/prefix".to_owned(); + + let s3 = AmazonS3Builder::new() + .with_url(s3_url) + .with_profile(TEST_PROFILE_NAME) + .build() + .unwrap(); + + let region = &s3.client.config().region; + + assert_eq!(region, TEST_PROFILE_REGION); + } + + #[test] + fn s3_test_region_override() { + let s3_url = "s3://bucket/prefix".to_owned(); + + let aws_profile = + env::var("AWS_PROFILE").unwrap_or_else(|_| TEST_PROFILE_NAME.into()); + + let aws_region = + env::var("AWS_REGION").unwrap_or_else(|_| "object_store:fake_region".into()); + + env::set_var("AWS_PROFILE", aws_profile); + + let s3 = AmazonS3Builder::from_env() + .with_url(s3_url) + .with_region(aws_region.clone()) + .build() + .unwrap(); + + let actual = &s3.client.config().region; + let expected = &aws_region; + + assert_eq!(actual, expected); + } +} diff --git a/src/aws/profile.rs b/src/aws/profile.rs new file mode 100644 index 0000000..a88824c --- /dev/null +++ b/src/aws/profile.rs @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
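For context, a minimal sketch of how the region fallback added here is exercised through the builder, assuming the `aws_profile` feature is enabled and that a local AWS config defines a profile with a `region` entry (the profile name and bucket URL below are placeholders; the builder methods are the same ones used in the tests above, and the profile lookup expects to run where a Tokio runtime handle is available):

    use object_store::aws::{AmazonS3, AmazonS3Builder};

    fn build_store() -> object_store::Result<AmazonS3> {
        // No explicit region: `build` now falls back to the region configured
        // for the named profile (requires the `aws_profile` feature).
        AmazonS3Builder::new()
            .with_url("s3://my-bucket/prefix") // placeholder bucket
            .with_profile("my-profile")        // placeholder profile name
            .build()
    }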
+ +#![cfg(feature = "aws_profile")] + +use aws_config::meta::region::ProvideRegion; +use aws_config::profile::profile_file::ProfileFiles; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::profile::ProfileFileRegionProvider; +use aws_config::provider_config::ProviderConfig; +use aws_credential_types::provider::ProvideCredentials; +use aws_types::region::Region; +use futures::future::BoxFuture; +use std::sync::Arc; +use std::time::Instant; +use std::time::SystemTime; + +use crate::aws::credential::CredentialProvider; +use crate::aws::AwsCredential; +use crate::client::token::{TemporaryToken, TokenCache}; +use crate::Result; + +#[cfg(test)] +pub static TEST_PROFILE_NAME: &str = "object_store:fake_profile"; + +#[cfg(test)] +pub static TEST_PROFILE_REGION: &str = "object_store:fake_region_from_profile"; + +#[derive(Debug)] +pub struct ProfileProvider { + name: String, + region: Option, + cache: TokenCache>, +} + +impl ProfileProvider { + pub fn new(name: String, region: Option) -> Self { + Self { + name, + region, + cache: Default::default(), + } + } + + #[cfg(test)] + fn profile_files(&self) -> ProfileFiles { + use aws_config::profile::profile_file::ProfileFileKind; + + let config = format!( + "[profile {}]\nregion = {}", + TEST_PROFILE_NAME, TEST_PROFILE_REGION + ); + + ProfileFiles::builder() + .with_contents(ProfileFileKind::Config, config) + .build() + } + + #[cfg(not(test))] + fn profile_files(&self) -> ProfileFiles { + ProfileFiles::default() + } + + pub async fn get_region(&self) -> Option { + if let Some(region) = self.region.clone() { + return Some(region); + } + + let provider = ProfileFileRegionProvider::builder() + .profile_files(self.profile_files()) + .profile_name(&self.name) + .build(); + + let region = provider.region().await; + + region.map(|r| r.as_ref().to_owned()) + } +} + +impl CredentialProvider for ProfileProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(self.cache.get_or_insert_with(move || async move { + let region = self.region.clone().map(Region::new); + + let config = ProviderConfig::default().with_region(region); + + let credentials = ProfileFileCredentialsProvider::builder() + .configure(&config) + .profile_name(&self.name) + .build(); + + let c = credentials.provide_credentials().await.map_err(|source| { + crate::Error::Generic { + store: "S3", + source: Box::new(source), + } + })?; + let t_now = SystemTime::now(); + let expiry = c + .expiry() + .and_then(|e| e.duration_since(t_now).ok()) + .map(|ttl| Instant::now() + ttl); + + Ok(TemporaryToken { + token: Arc::new(AwsCredential { + key_id: c.access_key_id().to_string(), + secret_key: c.secret_access_key().to_string(), + token: c.session_token().map(ToString::to_string), + }), + expiry, + }) + })) + } +} From 59bad6a27c7e5a8257e5d98c078b48423348d7ed Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 May 2023 08:31:01 +0100 Subject: [PATCH 143/397] Return NotFound for directories in Head and Get (#4230) (#4231) * Return NotFound for directories in Head and Get (#4230) * Fix webdav * Fix error message --- src/azure/client.rs | 14 +++++++++++++- src/http/client.rs | 11 +++++++---- src/http/mod.rs | 20 ++++++++------------ src/lib.rs | 8 ++++++++ src/local.rs | 43 +++++++++++++++++++++++++++---------------- 5 files changed, 63 insertions(+), 33 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index 4611986..893e261 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -257,7 
+257,19 @@ impl AzureClient { path: path.as_ref(), })?; - Ok(response) + match response.headers().get("x-ms-resource-type") { + Some(resource) if resource.as_ref() != b"file" => { + Err(crate::Error::NotFound { + path: path.to_string(), + source: format!( + "Not a file, got x-ms-resource-type: {}", + String::from_utf8_lossy(resource.as_ref()) + ) + .into(), + }) + } + _ => Ok(response), + } } /// Make an Azure Delete request diff --git a/src/http/client.rs b/src/http/client.rs index 4e58eb0..6feacbb 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -238,10 +238,13 @@ impl Client { .send_retry(&self.retry_config) .await .map_err(|source| match source.status() { - Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { - source: Box::new(source), - path: location.to_string(), - }, + // Some stores return METHOD_NOT_ALLOWED for get on directories + Some(StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED) => { + crate::Error::NotFound { + source: Box::new(source), + path: location.to_string(), + } + } _ => Error::Request { source }.into(), }) } diff --git a/src/http/mod.rs b/src/http/mod.rs index bed1972..124b7da 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -60,15 +60,6 @@ enum Error { url: String, }, - #[snafu(display("Object is a directory"))] - IsDirectory, - - #[snafu(display("PROPFIND response contained no valid objects"))] - NoObjects, - - #[snafu(display("PROPFIND response contained more than one object"))] - MultipleObjects, - #[snafu(display("Request error: {}", source))] Reqwest { source: reqwest::Error }, } @@ -134,12 +125,17 @@ impl ObjectStore for HttpStore { let response = status.response.into_iter().next().unwrap(); response.check_ok()?; match response.is_dir() { - true => Err(Error::IsDirectory.into()), + true => Err(crate::Error::NotFound { + path: location.to_string(), + source: "Is directory".to_string().into(), + }), false => response.object_meta(self.client.base_url()), } } - 0 => Err(Error::NoObjects.into()), - _ => Err(Error::MultipleObjects.into()), + x => Err(crate::Error::NotFound { + path: location.to_string(), + source: format!("Expected 1 result, got {x}").into(), + }), } } diff --git a/src/lib.rs b/src/lib.rs index 75f9ca7..0f3ed80 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -880,6 +880,14 @@ mod tests { assert_eq!(result.common_prefixes.len(), 1); assert_eq!(result.common_prefixes[0], Path::from("test_dir")); + // Should return not found + let err = storage.get(&Path::from("test_dir")).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + // Should return not found + let err = storage.head(&Path::from("test_dir")).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + // List everything starting with a prefix that should return results let prefix = Path::from("test_dir"); let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); diff --git a/src/local.rs b/src/local.rs index 26a8bf3..52719f1 100644 --- a/src/local.rs +++ b/src/local.rs @@ -419,18 +419,23 @@ impl ObjectStore for LocalFileSystem { maybe_spawn_blocking(move || { let metadata = match metadata(&path) { - Err(e) => Err(if e.kind() == ErrorKind::NotFound { - Error::NotFound { + Err(e) => Err(match e.kind() { + ErrorKind::NotFound => Error::NotFound { path: path.clone(), source: e, - } - } else { - Error::Metadata { + }, + _ => Error::Metadata { source: e.into(), path: location.to_string(), - } + }, }), - Ok(m) => Ok(m), + Ok(m) => match m.is_file() { + true => Ok(m), + false => Err(Error::NotFound { + path, + source: io::Error::new(ErrorKind::NotFound, "is not file"), + }), + }, }?; convert_metadata(metadata, location) }) @@ -878,19 +883,25 @@ fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result Result { - let file = File::open(path).map_err(|e| { - if e.kind() == std::io::ErrorKind::NotFound { - Error::NotFound { + let file = match File::open(path).and_then(|f| Ok((f.metadata()?, f))) { + Err(e) => Err(match e.kind() { + ErrorKind::NotFound => Error::NotFound { path: path.clone(), source: e, - } - } else { - Error::UnableToOpenFile { + }, + _ => Error::UnableToOpenFile { path: path.clone(), source: e, - } - } - })?; + }, + }), + Ok((metadata, file)) => match metadata.is_file() { + true => Ok(file), + false => Err(Error::NotFound { + path: path.clone(), + source: io::Error::new(ErrorKind::NotFound, "not a file"), + }), + }, + }?; Ok(file) } From fedfd15c76460f141b3c81503d30f494ce04bd6c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 May 2023 12:13:04 +0100 Subject: [PATCH 144/397] Standardise credentials API (#4223) (#4163) (#4225) * Standardise credentials API (#4223) (#4163) * Clippy * Allow HTTP metadata endpoint --- src/aws/client.rs | 6 +- src/aws/credential.rs | 91 +++++++++---------- src/aws/mod.rs | 60 +++++++------ src/aws/profile.rs | 71 ++++++++------- src/azure/client.rs | 52 ++--------- src/azure/credential.rs | 131 +++++++++++++++------------- src/azure/mod.rs | 65 +++++++------- src/client/mod.rs | 89 ++++++++++++++++++- src/gcp/credential.rs | 187 +++++++++++++++++++++------------------- src/gcp/mod.rs | 121 ++++++++++++-------------- 10 files changed, 461 insertions(+), 412 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 1cdf785..8ce743b 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -16,8 +16,8 @@ // under the License. 
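The standardised API below reduces each store's credential handling to a single trait with an associated credential type. A sketch of an external implementation, assuming the trait and `AwsCredential` end up publicly exported (the follow-up "Expose credential provider" patch in this series does expose the AWS types; `FixedCredentials` is a hypothetical name):

    use std::sync::Arc;

    use async_trait::async_trait;
    use object_store::aws::AwsCredential;
    // Assumed public re-export of the trait defined in src/client/mod.rs below.
    use object_store::{CredentialProvider, Result};

    /// Hypothetical provider that always hands out the same credentials,
    /// illustrating the shape of the standardised trait.
    #[derive(Debug)]
    struct FixedCredentials(Arc<AwsCredential>);

    #[async_trait]
    impl CredentialProvider for FixedCredentials {
        type Credential = AwsCredential;

        async fn get_credential(&self) -> Result<Arc<AwsCredential>> {
            Ok(Arc::clone(&self.0))
        }
    }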
use crate::aws::checksum::Checksum; -use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; -use crate::aws::{STORE, STRICT_PATH_ENCODE_SET}; +use crate::aws::credential::{AwsCredential, CredentialExt}; +use crate::aws::{AwsCredentialProvider, STORE, STRICT_PATH_ENCODE_SET}; use crate::client::list::ListResponse; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; @@ -135,7 +135,7 @@ pub struct S3Config { pub endpoint: String, pub bucket: String, pub bucket_endpoint: String, - pub credentials: Box, + pub credentials: AwsCredentialProvider, pub retry_config: RetryConfig, pub client_options: ClientOptions, pub sign_payload: bool, diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 9e04794..47d681c 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -18,12 +18,12 @@ use crate::aws::{STORE, STRICT_ENCODE_SET}; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; +use crate::client::TokenProvider; use crate::util::hmac_sha256; use crate::{Result, RetryConfig}; +use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; -use futures::future::BoxFuture; -use futures::TryFutureExt; use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; @@ -41,10 +41,14 @@ static EMPTY_SHA256_HASH: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; static UNSIGNED_PAYLOAD_LITERAL: &str = "UNSIGNED-PAYLOAD"; -#[derive(Debug)] +/// A set of AWS security credentials +#[derive(Debug, Eq, PartialEq)] pub struct AwsCredential { + /// AWS_ACCESS_KEY_ID pub key_id: String, + /// AWS_SECRET_ACCESS_KEY pub secret_key: String, + /// AWS_SESSION_TOKEN pub token: Option, } @@ -291,49 +295,31 @@ fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { (signed_headers, canonical_headers) } -/// Provides credentials for use when signing requests -pub trait CredentialProvider: std::fmt::Debug + Send + Sync { - fn get_credential(&self) -> BoxFuture<'_, Result>>; -} - -/// A static set of credentials -#[derive(Debug)] -pub struct StaticCredentialProvider { - pub credential: Arc, -} - -impl CredentialProvider for StaticCredentialProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(futures::future::ready(Ok(Arc::clone(&self.credential)))) - } -} - /// Credentials sourced from the instance metadata service /// /// #[derive(Debug)] pub struct InstanceCredentialProvider { pub cache: TokenCache>, - pub client: Client, - pub retry_config: RetryConfig, pub imdsv1_fallback: bool, pub metadata_endpoint: String, } -impl CredentialProvider for InstanceCredentialProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(self.cache.get_or_insert_with(|| { - instance_creds( - &self.client, - &self.retry_config, - &self.metadata_endpoint, - self.imdsv1_fallback, - ) +#[async_trait] +impl TokenProvider for InstanceCredentialProvider { + type Credential = AwsCredential; + + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>> { + instance_creds(client, retry, &self.metadata_endpoint, self.imdsv1_fallback) + .await .map_err(|source| crate::Error::Generic { store: STORE, source, }) - })) } } @@ -342,31 +328,34 @@ impl CredentialProvider for InstanceCredentialProvider { /// #[derive(Debug)] pub struct WebIdentityProvider { - pub cache: TokenCache>, pub token_path: String, pub role_arn: String, pub 
session_name: String, pub endpoint: String, - pub client: Client, - pub retry_config: RetryConfig, } -impl CredentialProvider for WebIdentityProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(self.cache.get_or_insert_with(|| { - web_identity( - &self.client, - &self.retry_config, - &self.token_path, - &self.role_arn, - &self.session_name, - &self.endpoint, - ) - .map_err(|source| crate::Error::Generic { - store: STORE, - source, - }) - })) +#[async_trait] +impl TokenProvider for WebIdentityProvider { + type Credential = AwsCredential; + + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>> { + web_identity( + client, + retry, + &self.token_path, + &self.role_arn, + &self.session_name, + &self.endpoint, + ) + .await + .map_err(|source| crate::Error::Generic { + store: STORE, + source, + }) } } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 428e013..ddb9dc7 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -48,11 +48,13 @@ use url::Url; pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ - AwsCredential, CredentialProvider, InstanceCredentialProvider, - StaticCredentialProvider, WebIdentityProvider, + AwsCredential, InstanceCredentialProvider, WebIdentityProvider, }; use crate::client::header::header_meta; -use crate::client::ClientConfigKey; +use crate::client::{ + ClientConfigKey, CredentialProvider, StaticCredentialProvider, + TokenCredentialProvider, +}; use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::{ @@ -83,6 +85,8 @@ const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.rem const STORE: &str = "S3"; +type AwsCredentialProvider = Arc>; + /// Default metadata endpoint static METADATA_ENDPOINT: &str = "http://169.254.169.254"; @@ -1001,13 +1005,12 @@ impl AmazonS3Builder { let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { info!("Using Static credential provider"); - Box::new(StaticCredentialProvider { - credential: Arc::new(AwsCredential { - key_id, - secret_key, - token, - }), - }) as _ + let credential = AwsCredential { + key_id, + secret_key, + token, + }; + Arc::new(StaticCredentialProvider::new(credential)) as _ } (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), @@ -1031,15 +1034,18 @@ impl AmazonS3Builder { .with_allow_http(false) .client()?; - Box::new(WebIdentityProvider { - cache: Default::default(), + let token = WebIdentityProvider { token_path, session_name, role_arn, endpoint, + }; + + Arc::new(TokenCredentialProvider::new( + token, client, - retry_config: self.retry_config.clone(), - }) as _ + self.retry_config.clone(), + )) as _ } _ => match self.profile { Some(profile) => { @@ -1049,19 +1055,20 @@ impl AmazonS3Builder { None => { info!("Using Instance credential provider"); - // The instance metadata endpoint is access over HTTP - let client_options = - self.client_options.clone().with_allow_http(true); - - Box::new(InstanceCredentialProvider { + let token = InstanceCredentialProvider { cache: Default::default(), - client: client_options.client()?, - retry_config: self.retry_config.clone(), imdsv1_fallback: self.imdsv1_fallback.get()?, metadata_endpoint: self .metadata_endpoint .unwrap_or_else(|| METADATA_ENDPOINT.into()), - }) as _ + }; + + 
Arc::new(TokenCredentialProvider::new( + token, + // The instance metadata endpoint is access over HTTP + self.client_options.clone().with_allow_http(true).client()?, + self.retry_config.clone(), + )) as _ } }, }, @@ -1114,11 +1121,8 @@ fn profile_region(profile: String) -> Option { } #[cfg(feature = "aws_profile")] -fn profile_credentials( - profile: String, - region: String, -) -> Result> { - Ok(Box::new(profile::ProfileProvider::new( +fn profile_credentials(profile: String, region: String) -> Result { + Ok(Arc::new(profile::ProfileProvider::new( profile, Some(region), ))) @@ -1133,7 +1137,7 @@ fn profile_region(_profile: String) -> Option { fn profile_credentials( _profile: String, _region: String, -) -> Result> { +) -> Result { Err(Error::MissingProfileFeature.into()) } diff --git a/src/aws/profile.rs b/src/aws/profile.rs index a88824c..3fc0805 100644 --- a/src/aws/profile.rs +++ b/src/aws/profile.rs @@ -17,6 +17,7 @@ #![cfg(feature = "aws_profile")] +use async_trait::async_trait; use aws_config::meta::region::ProvideRegion; use aws_config::profile::profile_file::ProfileFiles; use aws_config::profile::ProfileFileCredentialsProvider; @@ -24,14 +25,13 @@ use aws_config::profile::ProfileFileRegionProvider; use aws_config::provider_config::ProviderConfig; use aws_credential_types::provider::ProvideCredentials; use aws_types::region::Region; -use futures::future::BoxFuture; use std::sync::Arc; use std::time::Instant; use std::time::SystemTime; -use crate::aws::credential::CredentialProvider; use crate::aws::AwsCredential; use crate::client::token::{TemporaryToken, TokenCache}; +use crate::client::CredentialProvider; use crate::Result; #[cfg(test)] @@ -91,38 +91,43 @@ impl ProfileProvider { } } +#[async_trait] impl CredentialProvider for ProfileProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(self.cache.get_or_insert_with(move || async move { - let region = self.region.clone().map(Region::new); - - let config = ProviderConfig::default().with_region(region); - - let credentials = ProfileFileCredentialsProvider::builder() - .configure(&config) - .profile_name(&self.name) - .build(); - - let c = credentials.provide_credentials().await.map_err(|source| { - crate::Error::Generic { - store: "S3", - source: Box::new(source), - } - })?; - let t_now = SystemTime::now(); - let expiry = c - .expiry() - .and_then(|e| e.duration_since(t_now).ok()) - .map(|ttl| Instant::now() + ttl); - - Ok(TemporaryToken { - token: Arc::new(AwsCredential { - key_id: c.access_key_id().to_string(), - secret_key: c.secret_access_key().to_string(), - token: c.session_token().map(ToString::to_string), - }), - expiry, + type Credential = AwsCredential; + + async fn get_credential(&self) -> Result> { + self.cache + .get_or_insert_with(move || async move { + let region = self.region.clone().map(Region::new); + + let config = ProviderConfig::default().with_region(region); + + let credentials = ProfileFileCredentialsProvider::builder() + .configure(&config) + .profile_name(&self.name) + .build(); + + let c = credentials.provide_credentials().await.map_err(|source| { + crate::Error::Generic { + store: "S3", + source: Box::new(source), + } + })?; + let t_now = SystemTime::now(); + let expiry = c + .expiry() + .and_then(|e| e.duration_since(t_now).ok()) + .map(|ttl| Instant::now() + ttl); + + Ok(TemporaryToken { + token: Arc::new(AwsCredential { + key_id: c.access_key_id().to_string(), + secret_key: c.secret_access_key().to_string(), + token: c.session_token().map(ToString::to_string), + }), + expiry, + }) 
}) - })) + .await } } diff --git a/src/azure/client.rs b/src/azure/client.rs index 893e261..5f165c0 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use super::credential::{AzureCredential, CredentialProvider}; +use super::credential::AzureCredential; use crate::azure::credential::*; -use crate::azure::STORE; +use crate::azure::{AzureCredentialProvider, STORE}; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; @@ -40,6 +40,7 @@ use reqwest::{ use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::collections::HashMap; +use std::sync::Arc; use url::Url; /// A specialized `Error` for object store-related errors @@ -101,10 +102,10 @@ impl From for crate::Error { /// Configuration for [AzureClient] #[derive(Debug)] -pub struct AzureConfig { +pub(crate) struct AzureConfig { pub account: String, pub container: String, - pub credentials: CredentialProvider, + pub credentials: AzureCredentialProvider, pub retry_config: RetryConfig, pub service: Url, pub is_emulator: bool, @@ -143,45 +144,8 @@ impl AzureClient { &self.config } - async fn get_credential(&self) -> Result { - match &self.config.credentials { - CredentialProvider::AccessKey(key) => { - Ok(AzureCredential::AccessKey(key.to_owned())) - } - CredentialProvider::BearerToken(token) => { - Ok(AzureCredential::AuthorizationToken( - // we do the conversion to a HeaderValue here, since it is fallible - // and we want to use it in an infallible function - HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { - crate::Error::Generic { - store: STORE, - source: Box::new(err), - } - })?, - )) - } - CredentialProvider::TokenCredential(cache, cred) => { - let token = cache - .get_or_insert_with(|| { - cred.fetch_token(&self.client, &self.config.retry_config) - }) - .await - .context(AuthorizationSnafu)?; - Ok(AzureCredential::AuthorizationToken( - // we do the conversion to a HeaderValue here, since it is fallible - // and we want to use it in an infallible function - HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { - crate::Error::Generic { - store: STORE, - source: Box::new(err), - } - })?, - )) - } - CredentialProvider::SASToken(sas) => { - Ok(AzureCredential::SASToken(sas.clone())) - } - } + async fn get_credential(&self) -> Result> { + self.config.credentials.get_credential().await } /// Make an Azure PUT request @@ -308,7 +272,7 @@ impl AzureClient { // If using SAS authorization must include the headers in the URL // - if let AzureCredential::SASToken(pairs) = &credential { + if let AzureCredential::SASToken(pairs) = credential.as_ref() { source.query_pairs_mut().extend_pairs(pairs); } diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 8130df6..fd75389 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -15,10 +15,13 @@ // specific language governing permissions and limitations // under the License. 
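On the Azure side, the old `CredentialProvider` enum and the `AuthorizationToken` variant collapse into a single public `AzureCredential` value that every provider produces and the client matches on. A small sketch of that match, assuming the enum is re-exported from `object_store::azure` as the follow-up patch does (the helper function is hypothetical):

    use object_store::azure::AzureCredential;

    // The client decides how to authorize a request from the credential variant alone.
    fn auth_mechanism(credential: &AzureCredential) -> &'static str {
        match credential {
            AzureCredential::AccessKey(_) => "shared key request signing",
            AzureCredential::SASToken(_) => "SAS query pairs appended to the URL",
            AzureCredential::BearerToken(_) => "Authorization: Bearer header",
        }
    }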
+use crate::azure::STORE; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; +use crate::client::{CredentialProvider, TokenProvider}; use crate::util::hmac_sha256; use crate::RetryConfig; +use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use chrono::{DateTime, Utc}; @@ -36,6 +39,7 @@ use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::process::Command; use std::str; +use std::sync::Arc; use std::time::{Duration, Instant}; use url::Url; @@ -81,19 +85,30 @@ pub enum Error { pub type Result = std::result::Result; -/// Provides credentials for use when signing requests -#[derive(Debug)] -pub enum CredentialProvider { - AccessKey(String), - BearerToken(String), - SASToken(Vec<(String, String)>), - TokenCredential(TokenCache, Box), +impl From for crate::Error { + fn from(value: Error) -> Self { + Self::Generic { + store: STORE, + source: Box::new(value), + } + } } -pub(crate) enum AzureCredential { +/// An Azure storage credential +#[derive(Debug, Eq, PartialEq)] +pub enum AzureCredential { + /// A shared access key + /// + /// AccessKey(String), + /// A shared access signature + /// + /// SASToken(Vec<(String, String)>), - AuthorizationToken(HeaderValue), + /// An authorization token + /// + /// + BearerToken(String), } /// A list of known Azure authority hosts @@ -155,9 +170,7 @@ impl CredentialExt for RequestBuilder { Self::from_parts(client, request) } - AzureCredential::AuthorizationToken(token) => { - self.header(AUTHORIZATION, token) - } + AzureCredential::BearerToken(token) => self.bearer_auth(token), AzureCredential::SASToken(query_pairs) => self.query(&query_pairs), } } @@ -291,15 +304,6 @@ fn lexy_sort<'a>( values } -#[async_trait::async_trait] -pub trait TokenCredential: std::fmt::Debug + Send + Sync + 'static { - async fn fetch_token( - &self, - client: &Client, - retry: &RetryConfig, - ) -> Result>; -} - #[derive(Deserialize, Debug)] struct TokenResponse { access_token: String, @@ -338,13 +342,15 @@ impl ClientSecretOAuthProvider { } #[async_trait::async_trait] -impl TokenCredential for ClientSecretOAuthProvider { +impl TokenProvider for ClientSecretOAuthProvider { + type Credential = AzureCredential; + /// Fetch a token async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { let response: TokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) @@ -361,12 +367,10 @@ impl TokenCredential for ClientSecretOAuthProvider { .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + Ok(TemporaryToken { + token: Arc::new(AzureCredential::BearerToken(response.access_token)), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - - Ok(token) + }) } } @@ -397,7 +401,6 @@ pub struct ImdsManagedIdentityProvider { client_id: Option, object_id: Option, msi_res_id: Option, - client: Client, } impl ImdsManagedIdentityProvider { @@ -407,7 +410,6 @@ impl ImdsManagedIdentityProvider { object_id: Option, msi_res_id: Option, msi_endpoint: Option, - client: Client, ) -> Self { let msi_endpoint = msi_endpoint.unwrap_or_else(|| { "http://169.254.169.254/metadata/identity/oauth2/token".to_owned() @@ -418,19 +420,20 @@ impl ImdsManagedIdentityProvider { client_id, object_id, msi_res_id, - client, } } } #[async_trait::async_trait] -impl TokenCredential for ImdsManagedIdentityProvider { +impl TokenProvider for 
ImdsManagedIdentityProvider { + type Credential = AzureCredential; + /// Fetch a token async fn fetch_token( &self, - _client: &Client, + client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { let mut query_items = vec![ ("api-version", MSI_API_VERSION), ("resource", AZURE_STORAGE_RESOURCE), @@ -450,8 +453,7 @@ impl TokenCredential for ImdsManagedIdentityProvider { query_items.push((key, value)); } - let mut builder = self - .client + let mut builder = client .request(Method::GET, &self.msi_endpoint) .header("metadata", "true") .query(&query_items); @@ -468,12 +470,10 @@ impl TokenCredential for ImdsManagedIdentityProvider { .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + Ok(TemporaryToken { + token: Arc::new(AzureCredential::BearerToken(response.access_token)), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - - Ok(token) + }) } } @@ -511,13 +511,15 @@ impl WorkloadIdentityOAuthProvider { } #[async_trait::async_trait] -impl TokenCredential for WorkloadIdentityOAuthProvider { +impl TokenProvider for WorkloadIdentityOAuthProvider { + type Credential = AzureCredential; + /// Fetch a token async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { let token_str = std::fs::read_to_string(&self.federated_token_file) .map_err(|_| Error::FederatedTokenFile)?; @@ -542,12 +544,10 @@ impl TokenCredential for WorkloadIdentityOAuthProvider { .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + Ok(TemporaryToken { + token: Arc::new(AzureCredential::BearerToken(response.access_token)), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - - Ok(token) + }) } } @@ -585,23 +585,16 @@ struct AzureCliTokenResponse { #[derive(Default, Debug)] pub struct AzureCliCredential { - _private: (), + cache: TokenCache>, } impl AzureCliCredential { pub fn new() -> Self { Self::default() } -} -#[async_trait::async_trait] -impl TokenCredential for AzureCliCredential { /// Fetch a token - async fn fetch_token( - &self, - _client: &Client, - _retry: &RetryConfig, - ) -> Result> { + async fn fetch_token(&self) -> Result>> { // on window az is a cmd and it should be called like this // see https://doc.rust-lang.org/nightly/std/process/struct.Command.html let program = if cfg!(target_os = "windows") { @@ -642,7 +635,9 @@ impl TokenCredential for AzureCliCredential { let duration = token_response.expires_on.naive_local() - chrono::Local::now().naive_local(); Ok(TemporaryToken { - token: token_response.access_token, + token: Arc::new(AzureCredential::BearerToken( + token_response.access_token, + )), expiry: Some( Instant::now() + duration.to_std().map_err(|_| Error::AzureCli { @@ -669,6 +664,15 @@ impl TokenCredential for AzureCliCredential { } } +#[async_trait] +impl CredentialProvider for AzureCliCredential { + type Credential = AzureCredential; + + async fn get_credential(&self) -> crate::Result> { + Ok(self.cache.get_or_insert_with(|| self.fetch_token()).await?) 
+ } +} + #[cfg(test)] mod tests { use super::*; @@ -723,7 +727,6 @@ mod tests { None, None, Some(format!("{endpoint}/metadata/identity/oauth2/token")), - client.clone(), ); let token = credential @@ -731,7 +734,10 @@ mod tests { .await .unwrap(); - assert_eq!(&token.token, "TOKEN"); + assert_eq!( + token.token.as_ref(), + &AzureCredential::BearerToken("TOKEN".into()) + ); } #[tokio::test] @@ -779,6 +785,9 @@ mod tests { .await .unwrap(); - assert_eq!(&token.token, "TOKEN"); + assert_eq!( + token.token.as_ref(), + &AzureCredential::BearerToken("TOKEN".into()) + ); } } diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 0f8dae0..6dc14cf 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -27,7 +27,6 @@ //! a way to drop old blocks. Instead unused blocks are automatically cleaned up //! after 7 days. use self::client::{BlockId, BlockList}; -use crate::client::token::TokenCache; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::Path, @@ -49,14 +48,20 @@ use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; +use crate::azure::credential::AzureCredential; use crate::client::header::header_meta; -use crate::client::ClientConfigKey; +use crate::client::{ + ClientConfigKey, CredentialProvider, StaticCredentialProvider, + TokenCredentialProvider, +}; use crate::config::ConfigValue; pub use credential::authority_hosts; mod client; mod credential; +type AzureCredentialProvider = Arc>; + const STORE: &str = "MicrosoftAzure"; /// The well-known account used by Azurite and the legacy Azure Storage Emulator. @@ -101,12 +106,6 @@ enum Error { #[snafu(display("Container name must be specified"))] MissingContainerName {}, - #[snafu(display("At least one authorization option must be specified"))] - MissingCredentials {}, - - #[snafu(display("Azure credential error: {}", source), context(false))] - Credential { source: credential::Error }, - #[snafu(display( "Unknown url scheme cannot be parsed into storage location: {}", scheme @@ -913,6 +912,9 @@ impl MicrosoftAzureBuilder { } let container = self.container_name.ok_or(Error::MissingContainerName {})?; + let static_creds = |credential: AzureCredential| -> AzureCredentialProvider { + Arc::new(StaticCredentialProvider::new(credential)) + }; let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? 
{ let account_name = self @@ -924,7 +926,8 @@ impl MicrosoftAzureBuilder { let account_key = self .access_key .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); - let credential = credential::CredentialProvider::AccessKey(account_key); + + let credential = static_creds(AzureCredential::AccessKey(account_key)); self.client_options = self.client_options.with_allow_http(true); (true, url, credential, account_name) @@ -933,10 +936,11 @@ impl MicrosoftAzureBuilder { let account_url = format!("https://{}.blob.core.windows.net", &account_name); let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; + let credential = if let Some(bearer_token) = self.bearer_token { - credential::CredentialProvider::BearerToken(bearer_token) + static_creds(AzureCredential::BearerToken(bearer_token)) } else if let Some(access_key) = self.access_key { - credential::CredentialProvider::AccessKey(access_key) + static_creds(AzureCredential::AccessKey(access_key)) } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = (&self.client_id, &self.tenant_id, self.federated_token_file) { @@ -946,10 +950,11 @@ impl MicrosoftAzureBuilder { tenant_id, self.authority_host, ); - credential::CredentialProvider::TokenCredential( - TokenCache::default(), - Box::new(client_credential), - ) + Arc::new(TokenCredentialProvider::new( + client_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = (&self.client_id, self.client_secret, &self.tenant_id) { @@ -959,33 +964,29 @@ impl MicrosoftAzureBuilder { tenant_id, self.authority_host, ); - credential::CredentialProvider::TokenCredential( - TokenCache::default(), - Box::new(client_credential), - ) + Arc::new(TokenCredentialProvider::new( + client_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ } else if let Some(query_pairs) = self.sas_query_pairs { - credential::CredentialProvider::SASToken(query_pairs) + static_creds(AzureCredential::SASToken(query_pairs)) } else if let Some(sas) = self.sas_key { - credential::CredentialProvider::SASToken(split_sas(&sas)?) + static_creds(AzureCredential::SASToken(split_sas(&sas)?)) } else if self.use_azure_cli.get()? 
{ - credential::CredentialProvider::TokenCredential( - TokenCache::default(), - Box::new(credential::AzureCliCredential::new()), - ) + Arc::new(credential::AzureCliCredential::new()) as _ } else { - let client = - self.client_options.clone().with_allow_http(true).client()?; let msi_credential = credential::ImdsManagedIdentityProvider::new( self.client_id, self.object_id, self.msi_resource_id, self.msi_endpoint, - client, ); - credential::CredentialProvider::TokenCredential( - TokenCache::default(), - Box::new(msi_credential), - ) + Arc::new(TokenCredentialProvider::new( + msi_credential, + self.client_options.clone().with_allow_http(true).client()?, + self.retry_config.clone(), + )) as _ }; (false, url, credential, account_name) }; diff --git a/src/client/mod.rs b/src/client/mod.rs index c6a73fe..292e467 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -32,17 +32,20 @@ pub mod header; #[cfg(any(feature = "aws", feature = "gcp"))] pub mod list; +use async_trait::async_trait; use std::collections::HashMap; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy, RequestBuilder}; use serde::{Deserialize, Serialize}; +use crate::client::token::{TemporaryToken, TokenCache}; use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; -use crate::GetOptions; +use crate::{GetOptions, Result, RetryConfig}; fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { @@ -503,6 +506,90 @@ impl GetOptionsExt for RequestBuilder { } } +/// Provides credentials for use when signing requests +#[async_trait] +pub trait CredentialProvider: std::fmt::Debug + Send + Sync { + type Credential; + + async fn get_credential(&self) -> Result>; +} + +/// A static set of credentials +#[derive(Debug)] +pub struct StaticCredentialProvider { + credential: Arc, +} + +impl StaticCredentialProvider { + pub fn new(credential: T) -> Self { + Self { + credential: Arc::new(credential), + } + } +} + +#[async_trait] +impl CredentialProvider for StaticCredentialProvider +where + T: std::fmt::Debug + Send + Sync, +{ + type Credential = T; + + async fn get_credential(&self) -> Result> { + Ok(Arc::clone(&self.credential)) + } +} + +#[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] +mod cloud { + use super::*; + + /// A [`CredentialProvider`] that uses [`Client`] to fetch temporary tokens + #[derive(Debug)] + pub struct TokenCredentialProvider { + inner: T, + client: Client, + retry: RetryConfig, + cache: TokenCache>, + } + + impl TokenCredentialProvider { + pub fn new(inner: T, client: Client, retry: RetryConfig) -> Self { + Self { + inner, + client, + retry, + cache: Default::default(), + } + } + } + + #[async_trait] + impl CredentialProvider for TokenCredentialProvider { + type Credential = T::Credential; + + async fn get_credential(&self) -> Result> { + self.cache + .get_or_insert_with(|| self.inner.fetch_token(&self.client, &self.retry)) + .await + } + } + + #[async_trait] + pub trait TokenProvider: std::fmt::Debug + Send + Sync { + type Credential: std::fmt::Debug + Send + Sync; + + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>>; + } +} + +#[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] +pub use cloud::*; + #[cfg(test)] mod tests { use super::*; diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 057e013..ad12855 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -17,6 +17,9 @@ use 
crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; +use crate::client::{TokenCredentialProvider, TokenProvider}; +use crate::gcp::credential::Error::UnsupportedCredentialsType; +use crate::gcp::{GcpCredentialProvider, STORE}; use crate::ClientOptions; use crate::RetryConfig; use async_trait::async_trait; @@ -30,6 +33,7 @@ use std::env; use std::fs::File; use std::io::BufReader; use std::path::{Path, PathBuf}; +use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::info; @@ -67,9 +71,21 @@ pub enum Error { #[snafu(display("Unsupported ApplicationCredentials type: {}", type_))] UnsupportedCredentialsType { type_: String }, +} + +impl From for crate::Error { + fn from(value: Error) -> Self { + Self::Generic { + store: STORE, + source: Box::new(value), + } + } +} - #[snafu(display("Error creating client: {}", source))] - Client { source: crate::Error }, +#[derive(Debug, Eq, PartialEq)] +pub struct GcpCredential { + /// An HTTP bearer token + pub bearer: String, } pub type Result = std::result::Result; @@ -127,15 +143,6 @@ struct TokenResponse { expires_in: u64, } -#[async_trait] -pub trait TokenProvider: std::fmt::Debug + Send + Sync { - async fn fetch_token( - &self, - client: &Client, - retry: &RetryConfig, - ) -> Result>; -} - /// Encapsulates the logic to perform an OAuth token challenge #[derive(Debug)] pub struct OAuthProvider { @@ -174,12 +181,14 @@ impl OAuthProvider { #[async_trait] impl TokenProvider for OAuthProvider { + type Credential = GcpCredential; + /// Fetch a fresh token async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { let now = seconds_since_epoch(); let exp = now + 3600; @@ -221,12 +230,12 @@ impl TokenProvider for OAuthProvider { .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + Ok(TemporaryToken { + token: Arc::new(GcpCredential { + bearer: response.access_token, + }), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - - Ok(token) + }) } } @@ -281,17 +290,17 @@ impl ServiceAccountCredentials { } /// Create an [`OAuthProvider`] from this credentials struct. - pub fn token_provider( + pub fn oauth_provider( self, scope: &str, audience: &str, - ) -> Result> { - Ok(Box::new(OAuthProvider::new( + ) -> crate::Result { + Ok(OAuthProvider::new( self.client_email, self.private_key, scope.to_string(), audience.to_string(), - )?) as Box) + )?) } } @@ -329,23 +338,14 @@ fn b64_encode_obj(obj: &T) -> Result { #[derive(Debug, Default)] pub struct InstanceCredentialProvider { audience: String, - client: Client, } impl InstanceCredentialProvider { /// Create a new [`InstanceCredentialProvider`], we need to control the client in order to enable http access so save the options. 
- pub fn new>( - audience: T, - client_options: ClientOptions, - ) -> Result { - client_options - .with_allow_http(true) - .client() - .map(|client| Self { - audience: audience.into(), - client, - }) - .context(ClientSnafu) + pub fn new>(audience: T) -> Self { + Self { + audience: audience.into(), + } } } @@ -355,7 +355,7 @@ async fn make_metadata_request( hostname: &str, retry: &RetryConfig, audience: &str, -) -> Result { +) -> crate::Result { let url = format!( "http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token" ); @@ -374,30 +374,29 @@ async fn make_metadata_request( #[async_trait] impl TokenProvider for InstanceCredentialProvider { + type Credential = GcpCredential; + /// Fetch a token from the metadata server. /// Since the connection is local we need to enable http access and don't actually use the client object passed in. async fn fetch_token( &self, - _client: &Client, + client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { const METADATA_IP: &str = "169.254.169.254"; const METADATA_HOST: &str = "metadata"; info!("fetching token from metadata server"); let response = - make_metadata_request(&self.client, METADATA_HOST, retry, &self.audience) + make_metadata_request(client, METADATA_HOST, retry, &self.audience) .or_else(|_| { - make_metadata_request( - &self.client, - METADATA_IP, - retry, - &self.audience, - ) + make_metadata_request(client, METADATA_IP, retry, &self.audience) }) .await?; let token = TemporaryToken { - token: response.access_token, + token: Arc::new(GcpCredential { + bearer: response.access_token, + }), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -406,31 +405,35 @@ impl TokenProvider for InstanceCredentialProvider { /// ApplicationDefaultCredentials /// -#[derive(Debug)] -pub enum ApplicationDefaultCredentials { - /// - AuthorizedUser { - client_id: String, - client_secret: String, - refresh_token: String, - }, -} - -impl ApplicationDefaultCredentials { - pub fn new(path: Option<&str>) -> Result, Error> { - let file = match ApplicationDefaultCredentialsFile::read(path)? { - Some(f) => f, - None => return Ok(None), - }; - - Ok(Some(match file.type_.as_str() { - "authorized_user" => Self::AuthorizedUser { +pub fn application_default_credentials( + path: Option<&str>, + client: &ClientOptions, + retry: &RetryConfig, +) -> crate::Result> { + let file = match ApplicationDefaultCredentialsFile::read(path)? 
{ + Some(x) => x, + None => return Ok(None), + }; + + match file.type_.as_str() { + // + "authorized_user" => { + let token = AuthorizedUserCredentials { client_id: file.client_id, client_secret: file.client_secret, refresh_token: file.refresh_token, - }, - type_ => return UnsupportedCredentialsTypeSnafu { type_ }.fail(), - })) + }; + + Ok(Some(Arc::new(TokenCredentialProvider::new( + token, + client.client()?, + retry.clone(), + )))) + } + type_ => Err(UnsupportedCredentialsType { + type_: type_.to_string(), + } + .into()), } } @@ -473,41 +476,43 @@ impl ApplicationDefaultCredentialsFile { const DEFAULT_TOKEN_GCP_URI: &str = "https://accounts.google.com/o/oauth2/token"; +/// +#[derive(Debug)] +struct AuthorizedUserCredentials { + client_id: String, + client_secret: String, + refresh_token: String, +} + #[async_trait] -impl TokenProvider for ApplicationDefaultCredentials { +impl TokenProvider for AuthorizedUserCredentials { + type Credential = GcpCredential; + async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result, Error> { - let builder = client.request(Method::POST, DEFAULT_TOKEN_GCP_URI); - let builder = match self { - Self::AuthorizedUser { - client_id, - client_secret, - refresh_token, - } => { - let body = [ - ("grant_type", "refresh_token"), - ("client_id", client_id), - ("client_secret", client_secret), - ("refresh_token", refresh_token), - ]; - builder.form(&body) - } - }; - - let response = builder + ) -> crate::Result>> { + let response = client + .request(Method::POST, DEFAULT_TOKEN_GCP_URI) + .form(&[ + ("grant_type", "refresh_token"), + ("client_id", &self.client_id), + ("client_secret", &self.client_secret), + ("refresh_token", &self.refresh_token), + ]) .send_retry(retry) .await .context(TokenRequestSnafu)? 
.json::() .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + + Ok(TemporaryToken { + token: Arc::new(GcpCredential { + bearer: response.access_token, + }), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - Ok(token) + }) } } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 32f4055..6813bbf 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -48,9 +48,12 @@ use crate::client::header::header_meta; use crate::client::list::ListResponse; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; -use crate::client::{ClientConfigKey, GetOptionsExt}; +use crate::client::{ + ClientConfigKey, CredentialProvider, GetOptionsExt, StaticCredentialProvider, + TokenCredentialProvider, +}; +use crate::gcp::credential::{application_default_credentials, GcpCredential}; use crate::{ - client::token::TokenCache, multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, util::format_prefix, @@ -59,14 +62,15 @@ use crate::{ }; use self::credential::{ - default_gcs_base_url, ApplicationDefaultCredentials, InstanceCredentialProvider, - ServiceAccountCredentials, TokenProvider, + default_gcs_base_url, InstanceCredentialProvider, ServiceAccountCredentials, }; mod credential; const STORE: &str = "GCS"; +type GcpCredentialProvider = Arc>; + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] @@ -119,9 +123,6 @@ enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName {}, - #[snafu(display("Could not find either metadata credentials or configuration properties to initialize GCS credentials."))] - MissingCredentials, - #[snafu(display( "One of service account path or service account key may be provided." ))] @@ -209,8 +210,7 @@ struct GoogleCloudStorageClient { client: Client, base_url: String, - token_provider: Option>>, - token_cache: TokenCache, + credentials: GcpCredentialProvider, bucket_name: String, bucket_name_encoded: String, @@ -223,18 +223,8 @@ struct GoogleCloudStorageClient { } impl GoogleCloudStorageClient { - async fn get_token(&self) -> Result { - if let Some(token_provider) = &self.token_provider { - Ok(self - .token_cache - .get_or_insert_with(|| { - token_provider.fetch_token(&self.client, &self.retry_config) - }) - .await - .context(CredentialSnafu)?) 
- } else { - Ok("".to_owned()) - } + async fn get_credential(&self) -> Result> { + self.credentials.get_credential().await } fn object_url(&self, path: &Path) -> String { @@ -249,7 +239,7 @@ impl GoogleCloudStorageClient { options: GetOptions, head: bool, ) -> Result { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = self.object_url(path); let method = match head { @@ -260,7 +250,7 @@ impl GoogleCloudStorageClient { let response = self .client .request(method, url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .with_get_options(options) .send_retry(&self.retry_config) .await @@ -273,7 +263,7 @@ impl GoogleCloudStorageClient { /// Perform a put request async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = self.object_url(path); let content_type = self @@ -283,7 +273,7 @@ impl GoogleCloudStorageClient { self.client .request(Method::PUT, url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) .body(payload) @@ -298,7 +288,7 @@ impl GoogleCloudStorageClient { /// Initiate a multi-part upload async fn multipart_initiate(&self, path: &Path) -> Result { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); let content_type = self @@ -309,7 +299,7 @@ impl GoogleCloudStorageClient { let response = self .client .request(Method::POST, &url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, "0") .query(&[("uploads", "")]) @@ -338,12 +328,12 @@ impl GoogleCloudStorageClient { path: &str, multipart_id: &MultipartId, ) -> Result<()> { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); self.client .request(Method::DELETE, &url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, "application/octet-stream") .header(header::CONTENT_LENGTH, "0") .query(&[("uploadId", multipart_id)]) @@ -356,12 +346,12 @@ impl GoogleCloudStorageClient { /// Perform a delete request async fn delete_request(&self, path: &Path) -> Result<()> { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = self.object_url(path); let builder = self.client.request(Method::DELETE, url); builder - .bearer_auth(token) + .bearer_auth(&credential.bearer) .send_retry(&self.retry_config) .await .context(DeleteRequestSnafu { @@ -378,7 +368,7 @@ impl GoogleCloudStorageClient { to: &Path, if_not_exists: bool, ) -> Result<()> { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = self.object_url(to); let from = utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); @@ -394,7 +384,7 @@ impl GoogleCloudStorageClient { } builder - .bearer_auth(token) + .bearer_auth(&credential.bearer) // Needed if reqwest is compiled with native-tls instead of rustls-tls // See https://github.com/apache/arrow-rs/pull/3921 .header(header::CONTENT_LENGTH, 0) @@ -418,7 +408,7 @@ impl GoogleCloudStorageClient { delimiter: bool, page_token: Option<&str>, ) -> Result { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = format!("{}/{}", 
self.base_url, self.bucket_name_encoded); let mut query = Vec::with_capacity(5); @@ -443,7 +433,7 @@ impl GoogleCloudStorageClient { .client .request(Method::GET, url) .query(&query) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .send_retry(&self.retry_config) .await .context(ListRequestSnafu)? @@ -495,9 +485,9 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { self.client.base_url, self.client.bucket_name_encoded, self.encoded_path ); - let token = self + let credential = self .client - .get_token() + .get_credential() .await .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; @@ -505,7 +495,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .client .client .request(Method::PUT, &url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .query(&[ ("partNumber", format!("{}", part_idx + 1)), ("uploadId", upload_id), @@ -549,9 +539,9 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { }) .collect(); - let token = self + let credential = self .client - .get_token() + .get_credential() .await .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; @@ -567,7 +557,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { self.client .client .request(Method::POST, &url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .query(&[("uploadId", upload_id)]) .body(data) .send_retry(&self.client.retry_config) @@ -1062,10 +1052,11 @@ impl GoogleCloudStorageBuilder { }; // Then try to initialize from the application credentials file, or the environment. - let application_default_credentials = ApplicationDefaultCredentials::new( + let application_default_credentials = application_default_credentials( self.application_credentials_path.as_deref(), - ) - .context(CredentialSnafu)?; + &self.client_options, + &self.retry_config, + )?; let disable_oauth = service_account_credentials .as_ref() @@ -1081,29 +1072,24 @@ impl GoogleCloudStorageBuilder { let scope = "https://www.googleapis.com/auth/devstorage.full_control"; let audience = "https://www.googleapis.com/oauth2/v4/token"; - let token_provider = if disable_oauth { - None + let credentials = if disable_oauth { + Arc::new(StaticCredentialProvider::new(GcpCredential { + bearer: "".to_string(), + })) as _ + } else if let Some(credentials) = service_account_credentials { + Arc::new(TokenCredentialProvider::new( + credentials.oauth_provider(scope, audience)?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } else if let Some(credentials) = application_default_credentials { + credentials } else { - let best_provider = if let Some(credentials) = service_account_credentials { - Some( - credentials - .token_provider(scope, audience) - .context(CredentialSnafu)?, - ) - } else if let Some(credentials) = application_default_credentials { - Some(Box::new(credentials) as Box) - } else { - Some(Box::new( - InstanceCredentialProvider::new( - audience, - self.client_options.clone(), - ) - .context(CredentialSnafu)?, - ) as Box) - }; - - // A provider is required at this point, bail out if we don't have one. - Some(best_provider.ok_or(Error::MissingCredentials)?) 
+ Arc::new(TokenCredentialProvider::new( + InstanceCredentialProvider::new(audience), + self.client_options.clone().with_allow_http(true).client()?, + self.retry_config.clone(), + )) as _ }; let encoded_bucket_name = @@ -1113,8 +1099,7 @@ impl GoogleCloudStorageBuilder { client: Arc::new(GoogleCloudStorageClient { client, base_url: gcs_base_url, - token_provider: token_provider.map(Arc::new), - token_cache: Default::default(), + credentials, bucket_name, bucket_name_encoded: encoded_bucket_name, retry_config: self.retry_config, From 99924ceee6d1c58175aa2f04ba49184023a8c544 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 May 2023 21:13:25 +0100 Subject: [PATCH 145/397] Expose credential provider (#4235) --- src/aws/mod.rs | 159 +++++++++++++++++++++++------------------- src/azure/mod.rs | 25 ++++++- src/client/mod.rs | 2 + src/gcp/credential.rs | 1 + src/gcp/mod.rs | 30 ++++++-- src/lib.rs | 2 +- 6 files changed, 137 insertions(+), 82 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index ddb9dc7..a10561b 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -47,9 +47,7 @@ use url::Url; pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; -use crate::aws::credential::{ - AwsCredential, InstanceCredentialProvider, WebIdentityProvider, -}; +use crate::aws::credential::{InstanceCredentialProvider, WebIdentityProvider}; use crate::client::header::header_meta; use crate::client::{ ClientConfigKey, CredentialProvider, StaticCredentialProvider, @@ -85,7 +83,9 @@ const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.rem const STORE: &str = "S3"; -type AwsCredentialProvider = Arc>; +/// [`CredentialProvider`] for [`AmazonS3`] +pub type AwsCredentialProvider = Arc>; +pub use credential::AwsCredential; /// Default metadata endpoint static METADATA_ENDPOINT: &str = "http://169.254.169.254"; @@ -209,6 +209,13 @@ impl std::fmt::Display for AmazonS3 { } } +impl AmazonS3 { + /// Returns the [`AwsCredentialProvider`] used by [`AmazonS3`] + pub fn credentials(&self) -> &AwsCredentialProvider { + &self.client.config().credentials + } +} + #[async_trait] impl ObjectStore for AmazonS3 { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { @@ -424,6 +431,8 @@ pub struct AmazonS3Builder { profile: Option, /// Client options client_options: ClientOptions, + /// Credentials + credentials: Option, } /// Configuration keys for [`AmazonS3Builder`] @@ -879,6 +888,12 @@ impl AmazonS3Builder { self } + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: AwsCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + /// Sets what protocol is allowed. 
If `allow_http` is : /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed @@ -992,7 +1007,7 @@ impl AmazonS3Builder { self.parse_url(&url)?; } - let region = match (self.region.clone(), self.profile.clone()) { + let region = match (self.region, self.profile.clone()) { (Some(region), _) => Some(region), (None, Some(profile)) => profile_region(profile), (None, None) => None, @@ -1002,76 +1017,74 @@ impl AmazonS3Builder { let region = region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; - let credentials = match (self.access_key_id, self.secret_access_key, self.token) { - (Some(key_id), Some(secret_key), token) => { - info!("Using Static credential provider"); - let credential = AwsCredential { - key_id, - secret_key, - token, - }; - Arc::new(StaticCredentialProvider::new(credential)) as _ - } - (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), - (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), - // TODO: Replace with `AmazonS3Builder::credentials_from_env` - _ => match ( - std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), - std::env::var("AWS_ROLE_ARN"), - ) { - (Ok(token_path), Ok(role_arn)) => { - info!("Using WebIdentity credential provider"); - - let session_name = std::env::var("AWS_ROLE_SESSION_NAME") - .unwrap_or_else(|_| "WebIdentitySession".to_string()); - - let endpoint = format!("https://sts.{region}.amazonaws.com"); - - // Disallow non-HTTPs requests - let client = self - .client_options - .clone() - .with_allow_http(false) - .client()?; - - let token = WebIdentityProvider { - token_path, - session_name, - role_arn, - endpoint, - }; - - Arc::new(TokenCredentialProvider::new( + let credentials = if let Some(credentials) = self.credentials { + credentials + } else if self.access_key_id.is_some() || self.secret_access_key.is_some() { + match (self.access_key_id, self.secret_access_key, self.token) { + (Some(key_id), Some(secret_key), token) => { + info!("Using Static credential provider"); + let credential = AwsCredential { + key_id, + secret_key, token, - client, - self.retry_config.clone(), - )) as _ + }; + Arc::new(StaticCredentialProvider::new(credential)) as _ } - _ => match self.profile { - Some(profile) => { - info!("Using profile \"{}\" credential provider", profile); - profile_credentials(profile, region.clone())? 
- } - None => { - info!("Using Instance credential provider"); - - let token = InstanceCredentialProvider { - cache: Default::default(), - imdsv1_fallback: self.imdsv1_fallback.get()?, - metadata_endpoint: self - .metadata_endpoint - .unwrap_or_else(|| METADATA_ENDPOINT.into()), - }; - - Arc::new(TokenCredentialProvider::new( - token, - // The instance metadata endpoint is access over HTTP - self.client_options.clone().with_allow_http(true).client()?, - self.retry_config.clone(), - )) as _ - } - }, - }, + (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), + (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), + (None, None, _) => unreachable!(), + } + } else if let (Ok(token_path), Ok(role_arn)) = ( + std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), + std::env::var("AWS_ROLE_ARN"), + ) { + // TODO: Replace with `AmazonS3Builder::credentials_from_env` + info!("Using WebIdentity credential provider"); + + let session_name = std::env::var("AWS_ROLE_SESSION_NAME") + .unwrap_or_else(|_| "WebIdentitySession".to_string()); + + let endpoint = format!("https://sts.{region}.amazonaws.com"); + + // Disallow non-HTTPs requests + let client = self + .client_options + .clone() + .with_allow_http(false) + .client()?; + + let token = WebIdentityProvider { + token_path, + session_name, + role_arn, + endpoint, + }; + + Arc::new(TokenCredentialProvider::new( + token, + client, + self.retry_config.clone(), + )) as _ + } else if let Some(profile) = self.profile { + info!("Using profile \"{}\" credential provider", profile); + profile_credentials(profile, region.clone())? + } else { + info!("Using Instance credential provider"); + + let token = InstanceCredentialProvider { + cache: Default::default(), + imdsv1_fallback: self.imdsv1_fallback.get()?, + metadata_endpoint: self + .metadata_endpoint + .unwrap_or_else(|| METADATA_ENDPOINT.into()), + }; + + Arc::new(TokenCredentialProvider::new( + token, + // The instance metadata endpoint is access over HTTP + self.client_options.clone().with_allow_http(true).client()?, + self.retry_config.clone(), + )) as _ }; let endpoint: String; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 6dc14cf..069b033 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -48,7 +48,6 @@ use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; -use crate::azure::credential::AzureCredential; use crate::client::header::header_meta; use crate::client::{ ClientConfigKey, CredentialProvider, StaticCredentialProvider, @@ -60,7 +59,10 @@ pub use credential::authority_hosts; mod client; mod credential; -type AzureCredentialProvider = Arc>; +/// [`CredentialProvider`] for [`MicrosoftAzure`] +pub type AzureCredentialProvider = + Arc>; +pub use credential::AzureCredential; const STORE: &str = "MicrosoftAzure"; @@ -153,6 +155,13 @@ pub struct MicrosoftAzure { client: Arc, } +impl MicrosoftAzure { + /// Returns the [`AzureCredentialProvider`] used by [`MicrosoftAzure`] + pub fn credentials(&self) -> &AzureCredentialProvider { + &self.client.config().credentials + } +} + impl std::fmt::Display for MicrosoftAzure { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -374,6 +383,8 @@ pub struct MicrosoftAzureBuilder { retry_config: RetryConfig, /// Client options client_options: ClientOptions, + /// Credentials + credentials: Option, } /// Configuration keys for [`MicrosoftAzureBuilder`] @@ -840,6 +851,12 @@ impl MicrosoftAzureBuilder { self } + /// Set the credential provider overriding any other options 
+ pub fn with_credentials(mut self, credentials: AzureCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + /// Set if the Azure emulator should be used (defaults to false) pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { self.use_emulator = use_emulator.into(); @@ -937,7 +954,9 @@ impl MicrosoftAzureBuilder { let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; - let credential = if let Some(bearer_token) = self.bearer_token { + let credential = if let Some(credential) = self.credentials { + credential + } else if let Some(bearer_token) = self.bearer_token { static_creds(AzureCredential::BearerToken(bearer_token)) } else if let Some(access_key) = self.access_key { static_creds(AzureCredential::AccessKey(access_key)) diff --git a/src/client/mod.rs b/src/client/mod.rs index 292e467..8c23576 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -509,8 +509,10 @@ impl GetOptionsExt for RequestBuilder { /// Provides credentials for use when signing requests #[async_trait] pub trait CredentialProvider: std::fmt::Debug + Send + Sync { + /// The type of credential returned by this provider type Credential; + /// Return a credential async fn get_credential(&self) -> Result>; } diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index ad12855..205b805 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -82,6 +82,7 @@ impl From for crate::Error { } } +/// A Google Cloud Storage Credential #[derive(Debug, Eq, PartialEq)] pub struct GcpCredential { /// An HTTP bearer token diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 6813bbf..21ba158 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -52,7 +52,6 @@ use crate::client::{ ClientConfigKey, CredentialProvider, GetOptionsExt, StaticCredentialProvider, TokenCredentialProvider, }; -use crate::gcp::credential::{application_default_credentials, GcpCredential}; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, @@ -61,15 +60,18 @@ use crate::{ ObjectStore, Result, RetryConfig, }; -use self::credential::{ - default_gcs_base_url, InstanceCredentialProvider, ServiceAccountCredentials, +use credential::{ + application_default_credentials, default_gcs_base_url, InstanceCredentialProvider, + ServiceAccountCredentials, }; mod credential; const STORE: &str = "GCS"; -type GcpCredentialProvider = Arc>; +/// [`CredentialProvider`] for [`GoogleCloudStorage`] +pub type GcpCredentialProvider = Arc>; +pub use credential::GcpCredential; #[derive(Debug, Snafu)] enum Error { @@ -205,6 +207,13 @@ impl std::fmt::Display for GoogleCloudStorage { } } +impl GoogleCloudStorage { + /// Returns the [`GcpCredentialProvider`] used by [`GoogleCloudStorage`] + pub fn credentials(&self) -> &GcpCredentialProvider { + &self.client.credentials + } +} + #[derive(Debug)] struct GoogleCloudStorageClient { client: Client, @@ -696,6 +705,8 @@ pub struct GoogleCloudStorageBuilder { retry_config: RetryConfig, /// Client options client_options: ClientOptions, + /// Credentials + credentials: Option, } /// Configuration keys for [`GoogleCloudStorageBuilder`] @@ -794,6 +805,7 @@ impl Default for GoogleCloudStorageBuilder { retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), url: None, + credentials: None, } } } @@ -1006,6 +1018,12 @@ impl GoogleCloudStorageBuilder { self } + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: 
GcpCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -1072,7 +1090,9 @@ impl GoogleCloudStorageBuilder { let scope = "https://www.googleapis.com/auth/devstorage.full_control"; let audience = "https://www.googleapis.com/oauth2/v4/token"; - let credentials = if disable_oauth { + let credentials = if let Some(credentials) = self.credentials { + credentials + } else if disable_oauth { Arc::new(StaticCredentialProvider::new(GcpCredential { bearer: "".to_string(), })) as _ diff --git a/src/lib.rs b/src/lib.rs index 0f3ed80..7116a87 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -245,7 +245,7 @@ pub mod throttle; mod client; #[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] -pub use client::{backoff::BackoffConfig, retry::RetryConfig}; +pub use client::{backoff::BackoffConfig, retry::RetryConfig, CredentialProvider}; #[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] mod config; From 8bc32fe491fc9a4a281b33cc24f47f62c478417f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 08:50:56 +0100 Subject: [PATCH 146/397] Remove AWS_PROFILE support (#4238) --- Cargo.toml | 8 --- src/aws/mod.rs | 127 +------------------------------------------ src/aws/profile.rs | 133 --------------------------------------------- 3 files changed, 1 insertion(+), 267 deletions(-) delete mode 100644 src/aws/profile.rs diff --git a/Cargo.toml b/Cargo.toml index c6b89fa..bd9c973 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,11 +53,6 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls" ring = { version = "0.16", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } -# AWS Profile support -aws-types = { version = "0.55", optional = true } -aws-credential-types = { version = "0.55", optional = true } -aws-config = { version = "0.55", optional = true } - [target.'cfg(not(target_arch = "wasm32"))'.dependencies] tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util", "fs"] } @@ -74,9 +69,6 @@ gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] http = ["cloud"] -# Experimental support for AWS_PROFILE -aws_profile = ["aws", "aws-config", "aws-types", "aws-credential-types"] - [dev-dependencies] # In alphabetical order dotenv = "0.15.0" tempfile = "3.1.0" diff --git a/src/aws/mod.rs b/src/aws/mod.rs index a10561b..a7f43d1 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -64,9 +64,6 @@ mod checksum; mod client; mod credential; -#[cfg(feature = "aws_profile")] -mod profile; - // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html // // Do not URI-encode any of the unreserved characters that RFC 3986 defines: @@ -106,9 +103,6 @@ enum Error { #[snafu(display("Missing SecretAccessKey"))] MissingSecretAccessKey, - #[snafu(display("Profile support requires aws_profile feature"))] - MissingProfileFeature, - #[snafu(display("ETag Header missing from response"))] MissingEtag, @@ -427,8 +421,6 @@ pub struct AmazonS3Builder { checksum_algorithm: Option>, /// Metadata endpoint, see metadata_endpoint: Option, - /// Profile name, see - profile: Option, /// Client options client_options: ClientOptions, /// Credentials @@ -559,13 +551,6 @@ pub enum AmazonS3ConfigKey { /// - 
`metadata_endpoint` MetadataEndpoint, - /// AWS profile name - /// - /// Supported keys: - /// - `aws_profile` - /// - `profile` - Profile, - /// Client options Client(ClientConfigKey), } @@ -583,7 +568,6 @@ impl AsRef for AmazonS3ConfigKey { Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", Self::DefaultRegion => "aws_default_region", Self::MetadataEndpoint => "aws_metadata_endpoint", - Self::Profile => "aws_profile", Self::UnsignedPayload => "aws_unsigned_payload", Self::Checksum => "aws_checksum_algorithm", Self::Client(opt) => opt.as_ref(), @@ -612,7 +596,6 @@ impl FromStr for AmazonS3ConfigKey { "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { Ok(Self::VirtualHostedStyleRequest) } - "aws_profile" | "profile" => Ok(Self::Profile), "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), @@ -643,7 +626,6 @@ impl AmazonS3Builder { /// * `AWS_SESSION_TOKEN` -> token /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS - /// * `AWS_PROFILE` -> set profile name, requires `aws_profile` feature enabled /// # Example /// ``` /// use object_store::aws::AmazonS3Builder; @@ -727,7 +709,6 @@ impl AmazonS3Builder { AmazonS3ConfigKey::MetadataEndpoint => { self.metadata_endpoint = Some(value.into()) } - AmazonS3ConfigKey::Profile => self.profile = Some(value.into()), AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), AmazonS3ConfigKey::Checksum => { self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) @@ -794,7 +775,6 @@ impl AmazonS3Builder { Some(self.virtual_hosted_style_request.to_string()) } AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), - AmazonS3ConfigKey::Profile => self.profile.clone(), AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), AmazonS3ConfigKey::Checksum => { self.checksum_algorithm.as_ref().map(ToString::to_string) @@ -982,24 +962,6 @@ impl AmazonS3Builder { self } - /// Set the AWS profile name, see - /// - /// This makes use of [aws-config] to provide credentials and therefore requires - /// the `aws-profile` feature to be enabled - /// - /// It is strongly encouraged that users instead make use of a credential manager - /// such as [aws-vault] not only to avoid the significant additional dependencies, - /// but also to avoid storing credentials in [plain text on disk] - /// - /// [aws-config]: https://docs.rs/aws-config - /// [aws-vault]: https://github.com/99designs/aws-vault - /// [plain text on disk]: https://99designs.com.au/blog/engineering/aws-vault/ - #[cfg(feature = "aws_profile")] - pub fn with_profile(mut self, profile: impl Into) -> Self { - self.profile = Some(profile.into()); - self - } - /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. 
pub fn build(mut self) -> Result { @@ -1007,14 +969,8 @@ impl AmazonS3Builder { self.parse_url(&url)?; } - let region = match (self.region, self.profile.clone()) { - (Some(region), _) => Some(region), - (None, Some(profile)) => profile_region(profile), - (None, None) => None, - }; - let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; - let region = region.context(MissingRegionSnafu)?; + let region = self.region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let credentials = if let Some(credentials) = self.credentials { @@ -1065,9 +1021,6 @@ impl AmazonS3Builder { client, self.retry_config.clone(), )) as _ - } else if let Some(profile) = self.profile { - info!("Using profile \"{}\" credential provider", profile); - profile_credentials(profile, region.clone())? } else { info!("Using Instance credential provider"); @@ -1123,37 +1076,6 @@ impl AmazonS3Builder { } } -#[cfg(feature = "aws_profile")] -fn profile_region(profile: String) -> Option { - use tokio::runtime::Handle; - - let handle = Handle::current(); - let provider = profile::ProfileProvider::new(profile, None); - - handle.block_on(provider.get_region()) -} - -#[cfg(feature = "aws_profile")] -fn profile_credentials(profile: String, region: String) -> Result { - Ok(Arc::new(profile::ProfileProvider::new( - profile, - Some(region), - ))) -} - -#[cfg(not(feature = "aws_profile"))] -fn profile_region(_profile: String) -> Option { - None -} - -#[cfg(not(feature = "aws_profile"))] -fn profile_credentials( - _profile: String, - _region: String, -) -> Result { - Err(Error::MissingProfileFeature.into()) -} - #[cfg(test)] mod tests { use super::*; @@ -1638,50 +1560,3 @@ mod s3_resolve_bucket_region_tests { assert!(result.is_err()); } } - -#[cfg(all(test, feature = "aws_profile"))] -mod profile_tests { - use super::*; - use std::env; - - use super::profile::{TEST_PROFILE_NAME, TEST_PROFILE_REGION}; - - #[tokio::test] - async fn s3_test_region_from_profile() { - let s3_url = "s3://bucket/prefix".to_owned(); - - let s3 = AmazonS3Builder::new() - .with_url(s3_url) - .with_profile(TEST_PROFILE_NAME) - .build() - .unwrap(); - - let region = &s3.client.config().region; - - assert_eq!(region, TEST_PROFILE_REGION); - } - - #[test] - fn s3_test_region_override() { - let s3_url = "s3://bucket/prefix".to_owned(); - - let aws_profile = - env::var("AWS_PROFILE").unwrap_or_else(|_| TEST_PROFILE_NAME.into()); - - let aws_region = - env::var("AWS_REGION").unwrap_or_else(|_| "object_store:fake_region".into()); - - env::set_var("AWS_PROFILE", aws_profile); - - let s3 = AmazonS3Builder::from_env() - .with_url(s3_url) - .with_region(aws_region.clone()) - .build() - .unwrap(); - - let actual = &s3.client.config().region; - let expected = &aws_region; - - assert_eq!(actual, expected); - } -} diff --git a/src/aws/profile.rs b/src/aws/profile.rs deleted file mode 100644 index 3fc0805..0000000 --- a/src/aws/profile.rs +++ /dev/null @@ -1,133 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#![cfg(feature = "aws_profile")] - -use async_trait::async_trait; -use aws_config::meta::region::ProvideRegion; -use aws_config::profile::profile_file::ProfileFiles; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::profile::ProfileFileRegionProvider; -use aws_config::provider_config::ProviderConfig; -use aws_credential_types::provider::ProvideCredentials; -use aws_types::region::Region; -use std::sync::Arc; -use std::time::Instant; -use std::time::SystemTime; - -use crate::aws::AwsCredential; -use crate::client::token::{TemporaryToken, TokenCache}; -use crate::client::CredentialProvider; -use crate::Result; - -#[cfg(test)] -pub static TEST_PROFILE_NAME: &str = "object_store:fake_profile"; - -#[cfg(test)] -pub static TEST_PROFILE_REGION: &str = "object_store:fake_region_from_profile"; - -#[derive(Debug)] -pub struct ProfileProvider { - name: String, - region: Option, - cache: TokenCache>, -} - -impl ProfileProvider { - pub fn new(name: String, region: Option) -> Self { - Self { - name, - region, - cache: Default::default(), - } - } - - #[cfg(test)] - fn profile_files(&self) -> ProfileFiles { - use aws_config::profile::profile_file::ProfileFileKind; - - let config = format!( - "[profile {}]\nregion = {}", - TEST_PROFILE_NAME, TEST_PROFILE_REGION - ); - - ProfileFiles::builder() - .with_contents(ProfileFileKind::Config, config) - .build() - } - - #[cfg(not(test))] - fn profile_files(&self) -> ProfileFiles { - ProfileFiles::default() - } - - pub async fn get_region(&self) -> Option { - if let Some(region) = self.region.clone() { - return Some(region); - } - - let provider = ProfileFileRegionProvider::builder() - .profile_files(self.profile_files()) - .profile_name(&self.name) - .build(); - - let region = provider.region().await; - - region.map(|r| r.as_ref().to_owned()) - } -} - -#[async_trait] -impl CredentialProvider for ProfileProvider { - type Credential = AwsCredential; - - async fn get_credential(&self) -> Result> { - self.cache - .get_or_insert_with(move || async move { - let region = self.region.clone().map(Region::new); - - let config = ProviderConfig::default().with_region(region); - - let credentials = ProfileFileCredentialsProvider::builder() - .configure(&config) - .profile_name(&self.name) - .build(); - - let c = credentials.provide_credentials().await.map_err(|source| { - crate::Error::Generic { - store: "S3", - source: Box::new(source), - } - })?; - let t_now = SystemTime::now(); - let expiry = c - .expiry() - .and_then(|e| e.duration_since(t_now).ok()) - .map(|ttl| Instant::now() + ttl); - - Ok(TemporaryToken { - token: Arc::new(AwsCredential { - key_id: c.access_key_id().to_string(), - secret_key: c.secret_access_key().to_string(), - token: c.session_token().map(ToString::to_string), - }), - expiry, - }) - }) - .await - } -} From f376f6305796e49eaec48abf347e0ca88c8a88ea Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 09:05:41 +0100 Subject: [PATCH 147/397] Expose AwsAuthorizer (#4237) * Expose AWSAuthorizer * Review feedback --- src/aws/client.rs | 3 +- 
src/aws/credential.rs | 128 +++++++++++++++++++++++++++--------------- src/aws/mod.rs | 3 +- 3 files changed, 86 insertions(+), 48 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 8ce743b..2c45050 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -238,7 +238,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, - payload_sha256, + payload_sha256.as_deref(), ) .send_retry(&self.config.retry_config) .await @@ -315,7 +315,6 @@ impl S3Client { let mut query = Vec::with_capacity(4); - // Note: the order of these matters to ensure the generated URL is canonical if let Some(token) = token { query.push(("continuation-token", token)) } diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 47d681c..909dde0 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::aws::{STORE, STRICT_ENCODE_SET}; +use crate::aws::{STORE, STRICT_ENCODE_SET, STRICT_PATH_ENCODE_SET}; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::client::TokenProvider; @@ -39,7 +39,8 @@ type StdError = Box; /// SHA256 hash of empty string static EMPTY_SHA256_HASH: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; -static UNSIGNED_PAYLOAD_LITERAL: &str = "UNSIGNED-PAYLOAD"; +static UNSIGNED_PAYLOAD: &str = "UNSIGNED-PAYLOAD"; +static STREAMING_PAYLOAD: &str = "STREAMING-AWS4-HMAC-SHA256-PAYLOAD"; /// A set of AWS security credentials #[derive(Debug, Eq, PartialEq)] @@ -72,8 +73,12 @@ impl AwsCredential { } } -struct RequestSigner<'a> { - date: DateTime, +/// Authorize a [`Request`] with an [`AwsCredential`] using [AWS SigV4] +/// +/// [AWS SigV4]: https://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html +#[derive(Debug)] +pub struct AwsAuthorizer<'a> { + date: Option>, credential: &'a AwsCredential, service: &'a str, region: &'a str, @@ -85,39 +90,78 @@ const HASH_HEADER: &str = "x-amz-content-sha256"; const TOKEN_HEADER: &str = "x-amz-security-token"; const AUTH_HEADER: &str = "authorization"; -impl<'a> RequestSigner<'a> { - fn sign(&self, request: &mut Request, pre_calculated_digest: Option>) { +impl<'a> AwsAuthorizer<'a> { + /// Create a new [`AwsAuthorizer`] + pub fn new(credential: &'a AwsCredential, service: &'a str, region: &'a str) -> Self { + Self { + credential, + service, + region, + date: None, + sign_payload: true, + } + } + + /// Controls whether this [`AwsAuthorizer`] will attempt to sign the request payload, + /// the default is `true` + pub fn with_sign_payload(mut self, signed: bool) -> Self { + self.sign_payload = signed; + self + } + + /// Authorize `request` with an optional pre-calculated SHA256 digest by attaching + /// the relevant [AWS SigV4] headers + /// + /// # Payload Signature + /// + /// AWS SigV4 requests must contain the `x-amz-content-sha256` header, it is set as follows: + /// + /// * If not configured to sign payloads, it is set to `UNSIGNED-PAYLOAD` + /// * If a `pre_calculated_digest` is provided, it is set to the hex encoding of it + /// * If it is a streaming request, it is set to `STREAMING-AWS4-HMAC-SHA256-PAYLOAD` + /// * Otherwise it is set to the hex encoded SHA256 of the request body + /// + /// [AWS SigV4]: https://docs.aws.amazon.com/IAM/latest/UserGuide/create-signed-request.html + pub fn authorize(&self, request: &mut Request, pre_calculated_digest: Option<&[u8]>) { if let Some(ref token) = self.credential.token { 
let token_val = HeaderValue::from_str(token).unwrap(); request.headers_mut().insert(TOKEN_HEADER, token_val); } - let host_val = HeaderValue::from_str( - &request.url()[url::Position::BeforeHost..url::Position::AfterPort], - ) - .unwrap(); + let host = &request.url()[url::Position::BeforeHost..url::Position::AfterPort]; + let host_val = HeaderValue::from_str(host).unwrap(); request.headers_mut().insert("host", host_val); - let date_str = self.date.format("%Y%m%dT%H%M%SZ").to_string(); + let date = self.date.unwrap_or_else(Utc::now); + let date_str = date.format("%Y%m%dT%H%M%SZ").to_string(); let date_val = HeaderValue::from_str(&date_str).unwrap(); request.headers_mut().insert(DATE_HEADER, date_val); - let digest = if self.sign_payload { - if let Some(digest) = pre_calculated_digest { - hex_encode(&digest) - } else { - match request.body() { + let digest = match self.sign_payload { + false => UNSIGNED_PAYLOAD.to_string(), + true => match pre_calculated_digest { + Some(digest) => hex_encode(digest), + None => match request.body() { None => EMPTY_SHA256_HASH.to_string(), - Some(body) => hex_digest(body.as_bytes().unwrap()), - } - } - } else { - UNSIGNED_PAYLOAD_LITERAL.to_string() + Some(body) => match body.as_bytes() { + Some(bytes) => hex_digest(bytes), + None => STREAMING_PAYLOAD.to_string(), + }, + }, + }, }; let header_digest = HeaderValue::from_str(&digest).unwrap(); request.headers_mut().insert(HASH_HEADER, header_digest); + // Each path segment must be URI-encoded twice (except for Amazon S3 which only gets URI-encoded once). + // see https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + let canonical_uri = match self.service { + "s3" => request.url().path().to_string(), + _ => utf8_percent_encode(request.url().path(), &STRICT_PATH_ENCODE_SET) + .to_string(), + }; + let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); let canonical_query = canonicalize_query(request.url()); @@ -125,7 +169,7 @@ impl<'a> RequestSigner<'a> { let canonical_request = format!( "{}\n{}\n{}\n{}\n{}\n{}", request.method().as_str(), - request.url().path(), // S3 doesn't percent encode this like other services + canonical_uri, canonical_query, canonical_headers, signed_headers, @@ -135,14 +179,14 @@ impl<'a> RequestSigner<'a> { let hashed_canonical_request = hex_digest(canonical_request.as_bytes()); let scope = format!( "{}/{}/{}/aws4_request", - self.date.format("%Y%m%d"), + date.format("%Y%m%d"), self.region, self.service ); let string_to_sign = format!( "AWS4-HMAC-SHA256\n{}\n{}\n{}", - self.date.format("%Y%m%dT%H%M%SZ"), + date.format("%Y%m%dT%H%M%SZ"), scope, hashed_canonical_request ); @@ -150,7 +194,7 @@ impl<'a> RequestSigner<'a> { // sign the string let signature = self.credential - .sign(&string_to_sign, self.date, self.region, self.service); + .sign(&string_to_sign, date, self.region, self.service); // build the actual auth header let authorisation = format!( @@ -171,7 +215,7 @@ pub trait CredentialExt { region: &str, service: &str, sign_payload: bool, - payload_sha256: Option>, + payload_sha256: Option<&[u8]>, ) -> Self; } @@ -182,21 +226,15 @@ impl CredentialExt for RequestBuilder { region: &str, service: &str, sign_payload: bool, - payload_sha256: Option>, + payload_sha256: Option<&[u8]>, ) -> Self { let (client, request) = self.build_split(); let mut request = request.expect("request valid"); - let date = Utc::now(); - let signer = RequestSigner { - date, - credential, - service, - region, - sign_payload, - }; + 
AwsAuthorizer::new(credential, service, region) + .with_sign_payload(sign_payload) + .authorize(&mut request, payload_sha256); - signer.sign(&mut request, payload_sha256); Self::from_parts(client, request) } } @@ -539,15 +577,15 @@ mod tests { .build() .unwrap(); - let signer = RequestSigner { - date, + let signer = AwsAuthorizer { + date: Some(date), credential: &credential, service: "ec2", region: "us-east-1", sign_payload: true, }; - signer.sign(&mut request, None); + signer.authorize(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } @@ -577,15 +615,15 @@ mod tests { .build() .unwrap(); - let signer = RequestSigner { - date, + let authorizer = AwsAuthorizer { + date: Some(date), credential: &credential, service: "ec2", region: "us-east-1", sign_payload: false, }; - signer.sign(&mut request, None); + authorizer.authorize(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699") } @@ -614,15 +652,15 @@ mod tests { .build() .unwrap(); - let signer = RequestSigner { - date, + let authorizer = AwsAuthorizer { + date: Some(date), credential: &credential, service: "s3", region: "us-east-1", sign_payload: true, }; - signer.sign(&mut request, None); + authorizer.authorize(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=H20ABqCkLZID4rLe/20220809/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=9ebf2f92872066c99ac94e573b4e1b80f4dbb8a32b1e8e23178318746e7d1b4d") } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index a7f43d1..e71124f 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -82,7 +82,7 @@ const STORE: &str = "S3"; /// [`CredentialProvider`] for [`AmazonS3`] pub type AwsCredentialProvider = Arc>; -pub use credential::AwsCredential; +pub use credential::{AwsAuthorizer, AwsCredential}; /// Default metadata endpoint static METADATA_ENDPOINT: &str = "http://169.254.169.254"; @@ -160,6 +160,7 @@ impl From for super::Error { } /// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. 
+/// /// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html pub async fn resolve_bucket_region( bucket: &str, From 3d4d1e0543004c1ddd9e6bfb42a1b5deb7ddc701 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 09:18:38 +0100 Subject: [PATCH 148/397] Extract Common Listing and Retrieval Functionality (#4220) * Factor out common cloud storage client functionality * Remove format_prefix * Review feedback --- src/aws/client.rs | 224 +++++++++++++++++------------------- src/aws/mod.rs | 63 ++-------- src/azure/client.rs | 104 ++++++++--------- src/azure/mod.rs | 55 ++------- src/client/get.rs | 70 +++++++++++ src/client/list.rs | 162 +++++++++++++++++--------- src/client/list_response.rs | 85 ++++++++++++++ src/client/mod.rs | 18 ++- src/gcp/mod.rs | 148 +++++++++--------------- src/util.rs | 8 -- 10 files changed, 497 insertions(+), 440 deletions(-) create mode 100644 src/client/get.rs create mode 100644 src/client/list_response.rs diff --git a/src/aws/client.rs b/src/aws/client.rs index 2c45050..cfce352 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -18,17 +18,17 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; use crate::aws::{AwsCredentialProvider, STORE, STRICT_PATH_ENCODE_SET}; -use crate::client::list::ListResponse; -use crate::client::pagination::stream_paginated; +use crate::client::get::GetClient; +use crate::client::list::ListClient; +use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::multipart::UploadPart; use crate::path::DELIMITER; -use crate::util::format_prefix; use crate::{ - BoxStream, ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, - RetryConfig, StreamExt, + ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig, }; +use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; @@ -169,40 +169,6 @@ impl S3Client { self.config.credentials.get_credential().await } - /// Make an S3 GET request - pub async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { - let credential = self.get_credential().await?; - let url = self.config.path_url(path); - let method = match head { - true => Method::HEAD, - false => Method::GET, - }; - - let builder = self.client.request(method, url); - - let response = builder - .with_get_options(options) - .with_aws_sigv4( - credential.as_ref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) - .send_retry(&self.config.retry_config) - .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; - - Ok(response) - } - /// Make an S3 PUT request pub async fn put_request( &self, @@ -302,88 +268,6 @@ impl S3Client { Ok(()) } - /// Make an S3 List request - async fn list_request( - &self, - prefix: Option<&str>, - delimiter: bool, - token: Option<&str>, - offset: Option<&str>, - ) -> Result<(ListResult, Option)> { - let credential = self.get_credential().await?; - let url = self.config.bucket_endpoint.clone(); - - let mut query = Vec::with_capacity(4); - - if let Some(token) = token { - query.push(("continuation-token", token)) - } - - if delimiter { - query.push(("delimiter", DELIMITER)) - } - - query.push(("list-type", "2")); - - if let Some(prefix) = prefix { - query.push(("prefix", prefix)) - } - - if let Some(offset) = offset { - query.push(("start-after", 
offset)) - } - - let response = self - .client - .request(Method::GET, &url) - .query(&query) - .with_aws_sigv4( - credential.as_ref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) - .send_retry(&self.config.retry_config) - .await - .context(ListRequestSnafu)? - .bytes() - .await - .context(ListResponseBodySnafu)?; - - let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; - let token = response.next_continuation_token.take(); - - Ok((response.try_into()?, token)) - } - - /// Perform a list operation automatically handling pagination - pub fn list_paginated( - &self, - prefix: Option<&Path>, - delimiter: bool, - offset: Option<&Path>, - ) -> BoxStream<'_, Result> { - let offset = offset.map(|x| x.to_string()); - let prefix = format_prefix(prefix); - stream_paginated( - (prefix, offset), - move |(prefix, offset), token| async move { - let (r, next_token) = self - .list_request( - prefix.as_deref(), - delimiter, - token.as_deref(), - offset.as_deref(), - ) - .await?; - Ok((r, (prefix, offset), next_token)) - }, - ) - .boxed() - } - pub async fn create_multipart(&self, location: &Path) -> Result { let credential = self.get_credential().await?; let url = format!("{}?uploads=", self.config.path_url(location),); @@ -451,6 +335,104 @@ impl S3Client { } } +#[async_trait] +impl GetClient for S3Client { + const STORE: &'static str = STORE; + + /// Make an S3 GET request + async fn get_request( + &self, + path: &Path, + options: GetOptions, + head: bool, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + + let builder = self.client.request(method, url); + + let response = builder + .with_get_options(options) + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + None, + ) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } +} + +#[async_trait] +impl ListClient for S3Client { + /// Make an S3 List request + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + token: Option<&str>, + offset: Option<&str>, + ) -> Result<(ListResult, Option)> { + let credential = self.get_credential().await?; + let url = self.config.bucket_endpoint.clone(); + + let mut query = Vec::with_capacity(4); + + if let Some(token) = token { + query.push(("continuation-token", token)) + } + + if delimiter { + query.push(("delimiter", DELIMITER)) + } + + query.push(("list-type", "2")); + + if let Some(prefix) = prefix { + query.push(("prefix", prefix)) + } + + if let Some(offset) = offset { + query.push(("start-after", offset)) + } + + let response = self + .client + .request(Method::GET, &url) + .query(&query) + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + None, + ) + .send_retry(&self.config.retry_config) + .await + .context(ListRequestSnafu)? 
+ .bytes() + .await + .context(ListResponseBodySnafu)?; + + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .context(InvalidListResponseSnafu)?; + let token = response.next_continuation_token.take(); + + Ok((response.try_into()?, token)) + } +} + fn encode_path(path: &Path) -> PercentEncode<'_> { utf8_percent_encode(path.as_ref(), &STRICT_PATH_ENCODE_SET) } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index e71124f..4c6d346 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -34,11 +34,9 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; -use futures::TryStreamExt; use itertools::Itertools; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::collections::BTreeSet; use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; @@ -48,7 +46,8 @@ use url::Url; pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{InstanceCredentialProvider, WebIdentityProvider}; -use crate::client::header::header_meta; +use crate::client::get::GetClientExt; +use crate::client::list::ListClientExt; use crate::client::{ ClientConfigKey, CredentialProvider, StaticCredentialProvider, TokenCredentialProvider, @@ -57,7 +56,7 @@ use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::{ ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, Result, RetryConfig, StreamExt, + ObjectStore, Path, Result, RetryConfig, }; mod checksum; @@ -138,11 +137,6 @@ enum Error { #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] RegionParse { bucket: String }, - - #[snafu(display("Failed to parse headers: {}", source))] - Header { - source: crate::client::header::Error, - }, } impl From for super::Error { @@ -244,24 +238,11 @@ impl ObjectStore for AmazonS3 { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let response = self.client.get_request(location, options, false).await?; - let stream = response - .bytes_stream() - .map_err(|source| crate::Error::Generic { - store: STORE, - source: Box::new(source), - }) - .boxed(); - - Ok(GetResult::Stream(stream)) + self.client.get_opts(location, options).await } async fn head(&self, location: &Path) -> Result { - let options = GetOptions::default(); - // Extract meta from headers - // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax - let response = self.client.get_request(location, options, true).await?; - Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) 
+ self.client.head(location).await } async fn delete(&self, location: &Path) -> Result<()> { @@ -272,14 +253,7 @@ impl ObjectStore for AmazonS3 { &self, prefix: Option<&Path>, ) -> Result>> { - let stream = self - .client - .list_paginated(prefix, false, None) - .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) - .try_flatten() - .boxed(); - - Ok(stream) + self.client.list(prefix).await } async fn list_with_offset( @@ -287,32 +261,11 @@ impl ObjectStore for AmazonS3 { prefix: Option<&Path>, offset: &Path, ) -> Result>> { - let stream = self - .client - .list_paginated(prefix, false, Some(offset)) - .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) - .try_flatten() - .boxed(); - - Ok(stream) + self.client.list_with_offset(prefix, offset).await } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true, None); - - let mut common_prefixes = BTreeSet::new(); - let mut objects = Vec::new(); - - while let Some(result) = stream.next().await { - let response = result?; - common_prefixes.extend(response.common_prefixes.into_iter()); - objects.extend(response.objects.into_iter()); - } - - Ok(ListResult { - common_prefixes: common_prefixes.into_iter().collect(), - objects, - }) + self.client.list_with_delimiter(prefix).await } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { diff --git a/src/azure/client.rs b/src/azure/client.rs index 5f165c0..868a803 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -18,15 +18,16 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; -use crate::client::pagination::stream_paginated; +use crate::client::get::GetClient; +use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::path::DELIMITER; -use crate::util::{deserialize_rfc1123, format_prefix}; +use crate::util::deserialize_rfc1123; use crate::{ - BoxStream, ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, - RetryConfig, StreamExt, + ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, }; +use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; @@ -187,40 +188,6 @@ impl AzureClient { path: path.as_ref(), })?; - Ok(response) - } - - /// Make an Azure GET request - /// - /// - pub async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { - let credential = self.get_credential().await?; - let url = self.config.path_url(path); - let method = match head { - true => Method::HEAD, - false => Method::GET, - }; - - let builder = self - .client - .request(method, url) - .header(CONTENT_LENGTH, HeaderValue::from_static("0")) - .body(Bytes::new()); - - let response = builder - .with_get_options(options) - .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) - .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; - match response.headers().get("x-ms-resource-type") { Some(resource) if resource.as_ref() != b"file" => { Err(crate::Error::NotFound { @@ -300,14 +267,59 @@ impl AzureClient { Ok(()) } +} +#[async_trait] +impl GetClient for AzureClient { + const STORE: &'static str = STORE; + + /// Make an Azure GET request + /// + /// + async fn get_request( + &self, + path: &Path, + options: GetOptions, + head: bool, + ) -> Result { + let credential = 
self.get_credential().await?; + let url = self.config.path_url(path); + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + + let builder = self + .client + .request(method, url) + .header(CONTENT_LENGTH, HeaderValue::from_static("0")) + .body(Bytes::new()); + + let response = builder + .with_get_options(options) + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } +} + +#[async_trait] +impl ListClient for AzureClient { /// Make an Azure List request async fn list_request( &self, prefix: Option<&str>, delimiter: bool, token: Option<&str>, + offset: Option<&str>, ) -> Result<(ListResult, Option)> { + assert!(offset.is_none()); // Not yet supported + let credential = self.get_credential().await?; let url = self.config.path_url(&Path::default()); @@ -346,22 +358,6 @@ impl AzureClient { Ok((to_list_result(response, prefix)?, token)) } - - /// Perform a list operation automatically handling pagination - pub fn list_paginated( - &self, - prefix: Option<&Path>, - delimiter: bool, - ) -> BoxStream<'_, Result> { - let prefix = format_prefix(prefix); - stream_paginated(prefix, move |prefix, token| async move { - let (r, next_token) = self - .list_request(prefix.as_deref(), delimiter, token.as_deref()) - .await?; - Ok((r, prefix, next_token)) - }) - .boxed() - } } /// Raw / internal response from list requests diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 069b033..d273503 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -37,18 +37,19 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; -use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use futures::stream::BoxStream; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::fmt::{Debug, Formatter}; use std::io; +use std::str::FromStr; use std::sync::Arc; -use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; -use crate::client::header::header_meta; +use crate::client::get::GetClientExt; +use crate::client::list::ListClientExt; use crate::client::{ ClientConfigKey, CredentialProvider, StaticCredentialProvider, TokenCredentialProvider, @@ -128,11 +129,6 @@ enum Error { #[snafu(display("ETag Header missing from response"))] MissingEtag, - - #[snafu(display("Failed to parse headers: {}", source))] - Header { - source: crate::client::header::Error, - }, } impl From for super::Error { @@ -204,25 +200,11 @@ impl ObjectStore for MicrosoftAzure { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let response = self.client.get_request(location, options, false).await?; - let stream = response - .bytes_stream() - .map_err(|source| crate::Error::Generic { - store: STORE, - source: Box::new(source), - }) - .boxed(); - - Ok(GetResult::Stream(stream)) + self.client.get_opts(location, options).await } async fn head(&self, location: &Path) -> Result { - let options = GetOptions::default(); - - // Extract meta from headers - // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties - let response = self.client.get_request(location, options, true).await?; - Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) 
+ self.client.head(location).await } async fn delete(&self, location: &Path) -> Result<()> { @@ -233,32 +215,11 @@ impl ObjectStore for MicrosoftAzure { &self, prefix: Option<&Path>, ) -> Result>> { - let stream = self - .client - .list_paginated(prefix, false) - .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) - .try_flatten() - .boxed(); - - Ok(stream) + self.client.list(prefix).await } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true); - - let mut common_prefixes = BTreeSet::new(); - let mut objects = Vec::new(); - - while let Some(result) = stream.next().await { - let response = result?; - common_prefixes.extend(response.common_prefixes.into_iter()); - objects.extend(response.objects.into_iter()); - } - - Ok(ListResult { - common_prefixes: common_prefixes.into_iter().collect(), - objects, - }) + self.client.list_with_delimiter(prefix).await } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { diff --git a/src/client/get.rs b/src/client/get.rs new file mode 100644 index 0000000..3c66a72 --- /dev/null +++ b/src/client/get.rs @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::header::header_meta; +use crate::path::Path; +use crate::Result; +use crate::{Error, GetOptions, GetResult, ObjectMeta}; +use async_trait::async_trait; +use futures::{StreamExt, TryStreamExt}; +use reqwest::Response; + +/// A client that can perform a get request +#[async_trait] +pub trait GetClient: Send + Sync + 'static { + const STORE: &'static str; + + async fn get_request( + &self, + path: &Path, + options: GetOptions, + head: bool, + ) -> Result; +} + +/// Extension trait for [`GetClient`] that adds common retrieval functionality +#[async_trait] +pub trait GetClientExt { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; + + async fn head(&self, location: &Path) -> Result; +} + +#[async_trait] +impl GetClientExt for T { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let response = self.get_request(location, options, false).await?; + let stream = response + .bytes_stream() + .map_err(|source| Error::Generic { + store: T::STORE, + source: Box::new(source), + }) + .boxed(); + + Ok(GetResult::Stream(stream)) + } + + async fn head(&self, location: &Path) -> Result { + let options = GetOptions::default(); + let response = self.get_request(location, options, true).await?; + header_meta(location, response.headers()).map_err(|e| Error::Generic { + store: T::STORE, + source: Box::new(e), + }) + } +} diff --git a/src/client/list.rs b/src/client/list.rs index 6a3889e..b2dbee2 100644 --- a/src/client/list.rs +++ b/src/client/list.rs @@ -1,3 +1,4 @@ +// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file @@ -14,72 +15,123 @@ // specific language governing permissions and limitations // under the License. -//! 
The list response format used by GCP and AWS - +use crate::client::pagination::stream_paginated; use crate::path::Path; -use crate::{ListResult, ObjectMeta, Result}; -use chrono::{DateTime, Utc}; -use serde::Deserialize; - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListResponse { - #[serde(default)] - pub contents: Vec, - #[serde(default)] - pub common_prefixes: Vec, - #[serde(default)] - pub next_continuation_token: Option, +use crate::Result; +use crate::{ListResult, ObjectMeta}; +use async_trait::async_trait; +use futures::stream::BoxStream; +use futures::{StreamExt, TryStreamExt}; +use std::collections::BTreeSet; + +/// A client that can perform paginated list requests +#[async_trait] +pub trait ListClient: Send + Sync + 'static { + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + token: Option<&str>, + offset: Option<&str>, + ) -> Result<(ListResult, Option)>; } -impl TryFrom for ListResult { - type Error = crate::Error; +/// Extension trait for [`ListClient`] that adds common listing functionality +#[async_trait] +pub trait ListClientExt { + fn list_paginated( + &self, + prefix: Option<&Path>, + delimiter: bool, + offset: Option<&Path>, + ) -> BoxStream<'_, Result>; - fn try_from(value: ListResponse) -> Result { - let common_prefixes = value - .common_prefixes - .into_iter() - .map(|x| Ok(Path::parse(x.prefix)?)) - .collect::>()?; + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>>; - let objects = value - .contents - .into_iter() - .map(TryFrom::try_from) - .collect::>()?; + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>>; - Ok(Self { - common_prefixes, - objects, - }) - } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; } -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListPrefix { - pub prefix: String, -} +#[async_trait] +impl ListClientExt for T { + fn list_paginated( + &self, + prefix: Option<&Path>, + delimiter: bool, + offset: Option<&Path>, + ) -> BoxStream<'_, Result> { + let offset = offset.map(|x| x.to_string()); + let prefix = prefix + .filter(|x| !x.as_ref().is_empty()) + .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)); -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListContents { - pub key: String, - pub size: usize, - pub last_modified: DateTime, - #[serde(rename = "ETag")] - pub e_tag: Option, -} + stream_paginated( + (prefix, offset), + move |(prefix, offset), token| async move { + let (r, next_token) = self + .list_request( + prefix.as_deref(), + delimiter, + token.as_deref(), + offset.as_deref(), + ) + .await?; + Ok((r, (prefix, offset), next_token)) + }, + ) + .boxed() + } -impl TryFrom for ObjectMeta { - type Error = crate::Error; + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let stream = self + .list_paginated(prefix, false, None) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) + .try_flatten() + .boxed(); - fn try_from(value: ListContents) -> Result { - Ok(Self { - location: Path::parse(value.key)?, - last_modified: value.last_modified, - size: value.size, - e_tag: value.e_tag, + Ok(stream) + } + + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + let stream = self + .list_paginated(prefix, false, Some(offset)) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn 
list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let mut stream = self.list_paginated(prefix, true, None); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + while let Some(result) = stream.next().await { + let response = result?; + common_prefixes.extend(response.common_prefixes.into_iter()); + objects.extend(response.objects.into_iter()); + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, }) } } diff --git a/src/client/list_response.rs b/src/client/list_response.rs new file mode 100644 index 0000000..6a3889e --- /dev/null +++ b/src/client/list_response.rs @@ -0,0 +1,85 @@ +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The list response format used by GCP and AWS + +use crate::path::Path; +use crate::{ListResult, ObjectMeta, Result}; +use chrono::{DateTime, Utc}; +use serde::Deserialize; + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListResponse { + #[serde(default)] + pub contents: Vec, + #[serde(default)] + pub common_prefixes: Vec, + #[serde(default)] + pub next_continuation_token: Option, +} + +impl TryFrom for ListResult { + type Error = crate::Error; + + fn try_from(value: ListResponse) -> Result { + let common_prefixes = value + .common_prefixes + .into_iter() + .map(|x| Ok(Path::parse(x.prefix)?)) + .collect::>()?; + + let objects = value + .contents + .into_iter() + .map(TryFrom::try_from) + .collect::>()?; + + Ok(Self { + common_prefixes, + objects, + }) + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListPrefix { + pub prefix: String, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListContents { + pub key: String, + pub size: usize, + pub last_modified: DateTime, + #[serde(rename = "ETag")] + pub e_tag: Option, +} + +impl TryFrom for ObjectMeta { + type Error = crate::Error; + + fn try_from(value: ListContents) -> Result { + Ok(Self { + location: Path::parse(value.key)?, + last_modified: value.last_modified, + size: value.size, + e_tag: value.e_tag, + }) + } +} diff --git a/src/client/mod.rs b/src/client/mod.rs index 8c23576..5f3a042 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -20,9 +20,18 @@ pub mod backoff; #[cfg(test)] pub mod mock_server; + +pub mod retry; + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod pagination; -pub mod retry; + +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub mod get; + +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub mod list; + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; @@ -30,7 +39,7 @@ pub mod token; pub mod header; #[cfg(any(feature = "aws", feature = "gcp"))] -pub mod list; +pub mod list_response; use async_trait::async_trait; use 
std::collections::HashMap; @@ -42,10 +51,9 @@ use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy, RequestBuilder}; use serde::{Deserialize, Serialize}; -use crate::client::token::{TemporaryToken, TokenCache}; use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; -use crate::{GetOptions, Result, RetryConfig}; +use crate::{GetOptions, Result}; fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { @@ -545,6 +553,8 @@ where #[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] mod cloud { use super::*; + use crate::client::token::{TemporaryToken, TokenCache}; + use crate::RetryConfig; /// A [`CredentialProvider`] that uses [`Client`] to fetch temporary tokens #[derive(Debug)] diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 21ba158..7b11273 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -29,14 +29,13 @@ //! to abort the upload and drop those unneeded parts. In addition, you may wish to //! consider implementing automatic clean up of unused parts that are older than one //! week. -use std::collections::BTreeSet; use std::io; use std::str::FromStr; use std::sync::Arc; use async_trait::async_trait; use bytes::{Buf, Bytes}; -use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use futures::stream::BoxStream; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::{header, Client, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; @@ -44,9 +43,9 @@ use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; -use crate::client::header::header_meta; -use crate::client::list::ListResponse; -use crate::client::pagination::stream_paginated; +use crate::client::get::{GetClient, GetClientExt}; +use crate::client::list::{ListClient, ListClientExt}; +use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; use crate::client::{ ClientConfigKey, CredentialProvider, GetOptionsExt, StaticCredentialProvider, @@ -55,7 +54,6 @@ use crate::client::{ use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, - util::format_prefix, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, }; @@ -150,11 +148,6 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - - #[snafu(display("Failed to parse headers: {}", source))] - Header { - source: crate::client::header::Error, - }, } impl From for super::Error { @@ -241,35 +234,6 @@ impl GoogleCloudStorageClient { format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, encoded) } - /// Perform a get request - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let method = match head { - true => Method::HEAD, - false => Method::GET, - }; - - let response = self - .client - .request(method, url) - .bearer_auth(&credential.bearer) - .with_get_options(options) - .send_retry(&self.retry_config) - .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; - - Ok(response) - } - /// Perform a put request async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { let credential = self.get_credential().await?; @@ -409,14 +373,54 @@ impl GoogleCloudStorageClient { Ok(()) } +} +#[async_trait] +impl GetClient for GoogleCloudStorageClient { + const 
STORE: &'static str = STORE; + + /// Perform a get request + async fn get_request( + &self, + path: &Path, + options: GetOptions, + head: bool, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + + let response = self + .client + .request(method, url) + .bearer_auth(&credential.bearer) + .with_get_options(options) + .send_retry(&self.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } +} + +#[async_trait] +impl ListClient for GoogleCloudStorageClient { /// Perform a list request async fn list_request( &self, prefix: Option<&str>, delimiter: bool, page_token: Option<&str>, - ) -> Result { + offset: Option<&str>, + ) -> Result<(ListResult, Option)> { + assert!(offset.is_none()); // Not yet supported + let credential = self.get_credential().await?; let url = format!("{}/{}", self.base_url, self.bucket_name_encoded); @@ -450,27 +454,11 @@ impl GoogleCloudStorageClient { .await .context(ListResponseBodySnafu)?; - let response: ListResponse = quick_xml::de::from_reader(response.reader()) + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) .context(InvalidListResponseSnafu)?; - Ok(response) - } - - /// Perform a list operation automatically handling pagination - fn list_paginated( - &self, - prefix: Option<&Path>, - delimiter: bool, - ) -> BoxStream<'_, Result> { - let prefix = format_prefix(prefix); - stream_paginated(prefix, move |prefix, token| async move { - let mut r = self - .list_request(prefix.as_deref(), delimiter, token.as_deref()) - .await?; - let next_token = r.next_continuation_token.take(); - Ok((r.try_into()?, prefix, next_token)) - }) - .boxed() + let token = response.next_continuation_token.take(); + Ok((response.try_into()?, token)) } } @@ -613,22 +601,11 @@ impl ObjectStore for GoogleCloudStorage { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let response = self.client.get_request(location, options, false).await?; - let stream = response - .bytes_stream() - .map_err(|source| crate::Error::Generic { - store: STORE, - source: Box::new(source), - }) - .boxed(); - - Ok(GetResult::Stream(stream)) + self.client.get_opts(location, options).await } async fn head(&self, location: &Path) -> Result { - let options = GetOptions::default(); - let response = self.client.get_request(location, options, true).await?; - Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) 
+ self.client.head(location).await } async fn delete(&self, location: &Path) -> Result<()> { @@ -639,32 +616,11 @@ impl ObjectStore for GoogleCloudStorage { &self, prefix: Option<&Path>, ) -> Result>> { - let stream = self - .client - .list_paginated(prefix, false) - .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) - .try_flatten() - .boxed(); - - Ok(stream) + self.client.list(prefix).await } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true); - - let mut common_prefixes = BTreeSet::new(); - let mut objects = Vec::new(); - - while let Some(result) = stream.next().await { - let response = result?; - common_prefixes.extend(response.common_prefixes.into_iter()); - objects.extend(response.objects.into_iter()); - } - - Ok(ListResult { - common_prefixes: common_prefixes.into_iter().collect(), - objects, - }) + self.client.list_with_delimiter(prefix).await } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { diff --git a/src/util.rs b/src/util.rs index ba4c683..79ca4bb 100644 --- a/src/util.rs +++ b/src/util.rs @@ -36,14 +36,6 @@ where .map_err(serde::de::Error::custom) } -/// Returns the prefix to be passed to an object store -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] -pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { - prefix - .filter(|x| !x.as_ref().is_empty()) - .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)) -} - #[cfg(any(feature = "aws", feature = "azure"))] pub(crate) fn hmac_sha256( secret: impl AsRef<[u8]>, From 8cfb7e6791361002ad0721d59287e16429d0c10a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 09:51:03 +0100 Subject: [PATCH 149/397] Prepare object_store 0.6.0 (#4241) --- CHANGELOG-old.md | 38 +++++++++++++++ CHANGELOG.md | 80 +++++++++++++++++++++----------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 4 files changed, 94 insertions(+), 30 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index cc9453b..c9c4e28 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,44 @@ # Historical Changelog +## [object_store_0.5.6](https://github.com/apache/arrow-rs/tree/object_store_0.5.6) (2023-03-30) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.5...object_store_0.5.6) + +**Implemented enhancements:** + +- Document ObjectStore::list Ordering [\#3975](https://github.com/apache/arrow-rs/issues/3975) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add option to start listing at a particular key [\#3970](https://github.com/apache/arrow-rs/issues/3970) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Implement `ObjectStore` for trait objects [\#3865](https://github.com/apache/arrow-rs/issues/3865) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add ObjectStore::append [\#3790](https://github.com/apache/arrow-rs/issues/3790) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Make `InMemory` object store track last modified time for each entry [\#3782](https://github.com/apache/arrow-rs/issues/3782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Unsigned S3 Payloads [\#3737](https://github.com/apache/arrow-rs/issues/3737) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add Content-MD5 or checksum header for using an Object 
Locked S3 [\#3725](https://github.com/apache/arrow-rs/issues/3725) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- LocalFileSystem::put is not Atomic [\#3780](https://github.com/apache/arrow-rs/issues/3780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Add ObjectStore::list\_with\_offset \(\#3970\) [\#3973](https://github.com/apache/arrow-rs/pull/3973) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove incorrect validation logic on S3 bucket names [\#3947](https://github.com/apache/arrow-rs/pull/3947) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([rtyler](https://github.com/rtyler)) +- Prepare arrow 36 [\#3935](https://github.com/apache/arrow-rs/pull/3935) ([tustvold](https://github.com/tustvold)) +- fix: Specify content length for gcp copy request [\#3921](https://github.com/apache/arrow-rs/pull/3921) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) +- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) +- Add support for checksum algorithms in AWS [\#3873](https://github.com/apache/arrow-rs/pull/3873) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) +- Rename PrefixObjectStore to PrefixStore [\#3870](https://github.com/apache/arrow-rs/pull/3870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement append for LimitStore, PrefixObjectStore, ThrottledStore [\#3869](https://github.com/apache/arrow-rs/pull/3869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Supporting metadata fetch without open file read mode [\#3868](https://github.com/apache/arrow-rs/pull/3868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) +- Impl ObjectStore for trait object [\#3866](https://github.com/apache/arrow-rs/pull/3866) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Kinrany](https://github.com/Kinrany)) +- Update quick-xml requirement from 0.27.0 to 0.28.0 [\#3857](https://github.com/apache/arrow-rs/pull/3857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update changelog for 35.0.0 [\#3843](https://github.com/apache/arrow-rs/pull/3843) ([tustvold](https://github.com/tustvold)) +- Cleanup ApplicationDefaultCredentials [\#3799](https://github.com/apache/arrow-rs/pull/3799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make InMemory object store track last modified time for each entry [\#3796](https://github.com/apache/arrow-rs/pull/3796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) +- Add ObjectStore::append [\#3791](https://github.com/apache/arrow-rs/pull/3791) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make LocalFileSystem::put atomic \(\#3780\) [\#3781](https://github.com/apache/arrow-rs/pull/3781) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add support for unsigned payloads in aws [\#3741](https://github.com/apache/arrow-rs/pull/3741) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) + ## [object_store_0.5.5](https://github.com/apache/arrow-rs/tree/object_store_0.5.5) (2023-02-27) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.4...object_store_0.5.5) diff --git a/CHANGELOG.md b/CHANGELOG.md index b26ae71..bde0f75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,43 +19,69 @@ # Changelog -## [object_store_0.5.6](https://github.com/apache/arrow-rs/tree/object_store_0.5.6) (2023-03-30) +## [object_store_0.6.0](https://github.com/apache/arrow-rs/tree/object_store_0.6.0) (2023-05-18) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.5...object_store_0.5.6) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.6...object_store_0.6.0) + +**Breaking changes:** + +- Add ObjectStore::get\_opts \(\#2241\) [\#4212](https://github.com/apache/arrow-rs/pull/4212) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Simplify ObjectStore configuration pattern [\#4189](https://github.com/apache/arrow-rs/pull/4189) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: fix: Incorrect parsing of https Path Style S3 url [\#4082](https://github.com/apache/arrow-rs/pull/4082) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- feat: add etag for objectMeta [\#3937](https://github.com/apache/arrow-rs/pull/3937) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) **Implemented enhancements:** -- Document ObjectStore::list Ordering [\#3975](https://github.com/apache/arrow-rs/issues/3975) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add option to start listing at a particular key [\#3970](https://github.com/apache/arrow-rs/issues/3970) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Implement `ObjectStore` for trait objects [\#3865](https://github.com/apache/arrow-rs/issues/3865) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add ObjectStore::append [\#3790](https://github.com/apache/arrow-rs/issues/3790) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Make `InMemory` object store track last modified time for each entry [\#3782](https://github.com/apache/arrow-rs/issues/3782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support Unsigned S3 Payloads [\#3737](https://github.com/apache/arrow-rs/issues/3737) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add Content-MD5 or checksum header for using an Object Locked S3 [\#3725](https://github.com/apache/arrow-rs/issues/3725) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Object Store Authorization [\#4223](https://github.com/apache/arrow-rs/issues/4223) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Use XML API for GCS [\#4209](https://github.com/apache/arrow-rs/issues/4209) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- 
ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) +- Return Error on Invalid Config Value [\#4191](https://github.com/apache/arrow-rs/issues/4191) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Extensible ObjectStore Authentication [\#4163](https://github.com/apache/arrow-rs/issues/4163) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: When using an AWS profile, obtain the default AWS region from the active profile [\#4158](https://github.com/apache/arrow-rs/issues/4158) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- InMemory append API [\#4152](https://github.com/apache/arrow-rs/issues/4152) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) +- \[object\_store\] Retry requests on connection error [\#4119](https://github.com/apache/arrow-rs/issues/4119) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Instantiate object store from provided url with store options [\#4047](https://github.com/apache/arrow-rs/issues/4047) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Builders \(S3/Azure/GCS\) are missing the `get method` to get the actual configuration information [\#4021](https://github.com/apache/arrow-rs/issues/4021) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- LocalFileSystem::put is not Atomic [\#3780](https://github.com/apache/arrow-rs/issues/3780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore::head Returns Directory for LocalFileSystem and Hierarchical Azure [\#4230](https://github.com/apache/arrow-rs/issues/4230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: different behavior from aws cli for default profile [\#4137](https://github.com/apache/arrow-rs/issues/4137) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ImdsManagedIdentityOAuthProvider should send resource ID instead of OIDC scope [\#4096](https://github.com/apache/arrow-rs/issues/4096) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) +- object\_store: Incorrect parsing of https Path Style S3 url [\#4078](https://github.com/apache/arrow-rs/issues/4078) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] `local::tests::test_list_root` test fails during release verification [\#3772](https://github.com/apache/arrow-rs/issues/3772) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Add ObjectStore::list\_with\_offset \(\#3970\) [\#3973](https://github.com/apache/arrow-rs/pull/3973) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Remove incorrect validation logic on S3 bucket names [\#3947](https://github.com/apache/arrow-rs/pull/3947) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([rtyler](https://github.com/rtyler)) -- Prepare arrow 36 [\#3935](https://github.com/apache/arrow-rs/pull/3935) ([tustvold](https://github.com/tustvold)) -- fix: Specify content length for gcp copy request 
[\#3921](https://github.com/apache/arrow-rs/pull/3921) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) -- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) -- Add support for checksum algorithms in AWS [\#3873](https://github.com/apache/arrow-rs/pull/3873) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) -- Rename PrefixObjectStore to PrefixStore [\#3870](https://github.com/apache/arrow-rs/pull/3870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Implement append for LimitStore, PrefixObjectStore, ThrottledStore [\#3869](https://github.com/apache/arrow-rs/pull/3869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Supporting metadata fetch without open file read mode [\#3868](https://github.com/apache/arrow-rs/pull/3868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) -- Impl ObjectStore for trait object [\#3866](https://github.com/apache/arrow-rs/pull/3866) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Kinrany](https://github.com/Kinrany)) -- Update quick-xml requirement from 0.27.0 to 0.28.0 [\#3857](https://github.com/apache/arrow-rs/pull/3857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update changelog for 35.0.0 [\#3843](https://github.com/apache/arrow-rs/pull/3843) ([tustvold](https://github.com/tustvold)) -- Cleanup ApplicationDefaultCredentials [\#3799](https://github.com/apache/arrow-rs/pull/3799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Make InMemory object store track last modified time for each entry [\#3796](https://github.com/apache/arrow-rs/pull/3796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) -- Add ObjectStore::append [\#3791](https://github.com/apache/arrow-rs/pull/3791) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Make LocalFileSystem::put atomic \(\#3780\) [\#3781](https://github.com/apache/arrow-rs/pull/3781) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add support for unsigned payloads in aws [\#3741](https://github.com/apache/arrow-rs/pull/3741) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) +- Remove AWS\_PROFILE support [\#4238](https://github.com/apache/arrow-rs/pull/4238) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose AwsAuthorizer [\#4237](https://github.com/apache/arrow-rs/pull/4237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose CredentialProvider [\#4235](https://github.com/apache/arrow-rs/pull/4235) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Return NotFound for directories in Head and Get \(\#4230\) 
[\#4231](https://github.com/apache/arrow-rs/pull/4231) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Standardise credentials API \(\#4223\) \(\#4163\) [\#4225](https://github.com/apache/arrow-rs/pull/4225) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Extract Common Listing and Retrieval Functionality [\#4220](https://github.com/apache/arrow-rs/pull/4220) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object-store\): extend Options API for http client [\#4208](https://github.com/apache/arrow-rs/pull/4208) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Consistently use GCP XML API [\#4207](https://github.com/apache/arrow-rs/pull/4207) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement list\_with\_offset for PrefixStore [\#4203](https://github.com/apache/arrow-rs/pull/4203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Allow setting ClientOptions with Options API [\#4202](https://github.com/apache/arrow-rs/pull/4202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Create ObjectStore from URL and Options \(\#4047\) [\#4200](https://github.com/apache/arrow-rs/pull/4200) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Skip test\_list\_root on OS X \(\#3772\) [\#4198](https://github.com/apache/arrow-rs/pull/4198) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Recognise R2 URLs for S3 object store \(\#4190\) [\#4194](https://github.com/apache/arrow-rs/pull/4194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix ImdsManagedIdentityProvider \(\#4096\) [\#4193](https://github.com/apache/arrow-rs/pull/4193) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Deffered Object Store Config Parsing \(\#4191\) [\#4192](https://github.com/apache/arrow-rs/pull/4192) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object Store \(AWS\): Support dynamically resolving S3 bucket region [\#4188](https://github.com/apache/arrow-rs/pull/4188) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) +- Faster prefix match in object\_store path handling [\#4164](https://github.com/apache/arrow-rs/pull/4164) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object Store \(AWS\): Support region configured via named profile [\#4161](https://github.com/apache/arrow-rs/pull/4161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) +- InMemory append API [\#4153](https://github.com/apache/arrow-rs/pull/4153) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([berkaysynnada](https://github.com/berkaysynnada)) +- docs: fix the wrong ln command in CONTRIBUTING.md 
[\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) +- Display the file path in the error message when failed to open credentials file for GCS [\#4124](https://github.com/apache/arrow-rs/pull/4124) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([haoxins](https://github.com/haoxins)) +- Retry on Connection Errors [\#4120](https://github.com/apache/arrow-rs/pull/4120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kindly](https://github.com/kindly)) +- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) +- Use reqwest build\_split [\#4039](https://github.com/apache/arrow-rs/pull/4039) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) +- Add get\_config\_value to AWS/Azure/GCP Builders [\#4035](https://github.com/apache/arrow-rs/pull/4035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([r4ntix](https://github.com/r4ntix)) +- Update AWS SDK [\#3993](https://github.com/apache/arrow-rs/pull/3993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) diff --git a/Cargo.toml b/Cargo.toml index bd9c973..1fb9886 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.6" +version = "0.6.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index b69d36f..6090630 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.5" -FUTURE_RELEASE="object_store_0.5.6" +SINCE_TAG="object_store_0.5.6" +FUTURE_RELEASE="object_store_0.6.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 6eb2278b882694a1937c06d126ffaca3d916aec2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 10:14:50 +0100 Subject: [PATCH 150/397] Fix merge conflict from #4220 (#4242) --- src/azure/client.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index 868a803..5ed6f24 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -188,19 +188,7 @@ impl AzureClient { path: path.as_ref(), })?; - match response.headers().get("x-ms-resource-type") { - Some(resource) if resource.as_ref() != b"file" => { - Err(crate::Error::NotFound { - path: path.to_string(), - source: format!( - "Not a file, got x-ms-resource-type: {}", - String::from_utf8_lossy(resource.as_ref()) - ) - .into(), - }) - } - _ => Ok(response), - } + Ok(response) } /// Make an Azure Delete request @@ -304,7 +292,19 @@ impl GetClient for AzureClient { path: path.as_ref(), })?; - Ok(response) + match response.headers().get("x-ms-resource-type") { + Some(resource) if resource.as_ref() != b"file" => { + Err(crate::Error::NotFound { + path: path.to_string(), + source: format!( + "Not a file, got x-ms-resource-type: {}", + String::from_utf8_lossy(resource.as_ref()) + ) + .into(), + }) + } + _ => Ok(response), + } } } From f58f73351ad50ac6a49f83a51c12968862c44b48 Mon Sep 17 00:00:00 2001 
From: Johann Fuechsl Date: Fri, 26 May 2023 14:30:34 +0200 Subject: [PATCH 151/397] Set ECS specific metadata endpoint if AWS_CONTAINER_CREDENTIALS_RELATIVE_URI is set (#4288) Co-authored-by: Johann Fuechsl --- src/aws/mod.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 4c6d346..fac6165 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -84,7 +84,10 @@ pub type AwsCredentialProvider = Arc Date: Sat, 27 May 2023 02:31:34 -0700 Subject: [PATCH 152/397] feat: support bulk deletes in object_store (#4060) * feat: support bulk deletes * fix: make NotFound reporting consistent * fix http store * fix aws support * remove unnecessary flag * fix: make AWS S3 compatible * pr feedback: use simpler API * pr feedback: test paths and ordering * Update object_store/src/limit.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * take fallible stream * final pr feedback --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- CONTRIBUTING.md | 14 ++-- Cargo.toml | 2 +- src/aws/client.rs | 175 +++++++++++++++++++++++++++++++++++++++++++++ src/aws/mod.rs | 21 ++++++ src/http/client.rs | 8 ++- src/lib.rs | 123 +++++++++++++++++++++++++++++-- src/limit.rs | 7 ++ src/local.rs | 9 ++- 8 files changed, 345 insertions(+), 14 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 47c2940..aeb38e1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,7 +39,8 @@ To test the S3 integration against [localstack](https://localstack.cloud/) First start up a container running localstack ``` -$ podman run --rm -it -e PROVIDER_OVERRIDE_S3=asf -p 4566:4566 -p 4510-4559:4510-4559 localstack/localstack +$ podman run -d -p 4566:4566 localstack/localstack:2.0 +$ podman run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2 ``` Setup environment @@ -87,13 +88,18 @@ $ podman run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azur Create a bucket ``` -$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://128.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://128.0.0.1:10001/devstoreaccount1;' +$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' ``` Run tests -``` -$ cargo test --features azure +```shell +AZURE_USE_EMULATOR=1 \ +TEST_INTEGRATION=1 \ +OBJECT_STORE_BUCKET=test-bucket \ +AZURE_STORAGE_ACCOUNT=devstoreaccount1 \ +AZURE_STORAGE_ACCESS_KEY=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== \ +cargo test --features azure ``` ### GCP diff --git a/Cargo.toml b/Cargo.toml index 1fb9886..28bf29f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,7 +45,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } hyper = { version = "0.14", default-features = false, optional = true } -quick-xml = { version = "0.28.0", features = ["serialize"], optional = true } +quick-xml = { 
version = "0.28.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/src/aws/client.rs b/src/aws/client.rs index cfce352..0c24936 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -32,7 +32,9 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; +use itertools::Itertools; use percent_encoding::{utf8_percent_encode, PercentEncode}; +use quick_xml::events::{self as xml_events}; use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, Response}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; @@ -66,6 +68,29 @@ pub(crate) enum Error { path: String, }, + #[snafu(display("Error performing DeleteObjects request: {}", source))] + DeleteObjectsRequest { source: crate::client::retry::Error }, + + #[snafu(display( + "DeleteObjects request failed for key {}: {} (code: {})", + path, + message, + code + ))] + DeleteFailed { + path: String, + code: String, + message: String, + }, + + #[snafu(display("Error getting DeleteObjects response body: {}", source))] + DeleteObjectsResponse { source: reqwest::Error }, + + #[snafu(display("Got invalid DeleteObjects response: {}", source))] + InvalidDeleteObjectsResponse { + source: Box, + }, + #[snafu(display("Error performing copy request {}: {}", path, source))] CopyRequest { source: crate::client::retry::Error, @@ -129,6 +154,44 @@ struct MultipartPart { part_number: usize, } +#[derive(Deserialize)] +#[serde(rename_all = "PascalCase", rename = "DeleteResult")] +struct BatchDeleteResponse { + #[serde(rename = "$value")] + content: Vec, +} + +#[derive(Deserialize)] +enum DeleteObjectResult { + Deleted(DeletedObject), + Error(DeleteError), +} + +#[derive(Deserialize)] +#[serde(rename_all = "PascalCase", rename = "Deleted")] +struct DeletedObject { + #[allow(dead_code)] + key: String, +} + +#[derive(Deserialize)] +#[serde(rename_all = "PascalCase", rename = "Error")] +struct DeleteError { + key: String, + code: String, + message: String, +} + +impl From for Error { + fn from(err: DeleteError) -> Self { + Self::DeleteFailed { + path: err.key, + code: err.code, + message: err.message, + } + } +} + #[derive(Debug)] pub struct S3Config { pub region: String, @@ -243,6 +306,118 @@ impl S3Client { Ok(()) } + /// Make an S3 Delete Objects request + /// + /// Produces a vector of results, one for each path in the input vector. If + /// the delete was successful, the path is returned in the `Ok` variant. If + /// there was an error for a certain path, the error will be returned in the + /// vector. If there was an issue with making the overall request, an error + /// will be returned at the top level. 
+ pub async fn bulk_delete_request( + &self, + paths: Vec, + ) -> Result>> { + if paths.is_empty() { + return Ok(Vec::new()); + } + + let credential = self.get_credential().await?; + let url = format!("{}?delete", self.config.bucket_endpoint); + + let mut buffer = Vec::new(); + let mut writer = quick_xml::Writer::new(&mut buffer); + writer + .write_event(xml_events::Event::Start( + xml_events::BytesStart::new("Delete").with_attributes([( + "xmlns", + "http://s3.amazonaws.com/doc/2006-03-01/", + )]), + )) + .unwrap(); + for path in &paths { + // {path} + writer + .write_event(xml_events::Event::Start(xml_events::BytesStart::new( + "Object", + ))) + .unwrap(); + writer + .write_event(xml_events::Event::Start(xml_events::BytesStart::new("Key"))) + .unwrap(); + writer + .write_event(xml_events::Event::Text(xml_events::BytesText::new( + path.as_ref(), + ))) + .map_err(|err| crate::Error::Generic { + store: STORE, + source: Box::new(err), + })?; + writer + .write_event(xml_events::Event::End(xml_events::BytesEnd::new("Key"))) + .unwrap(); + writer + .write_event(xml_events::Event::End(xml_events::BytesEnd::new("Object"))) + .unwrap(); + } + writer + .write_event(xml_events::Event::End(xml_events::BytesEnd::new("Delete"))) + .unwrap(); + + let body = Bytes::from(buffer); + + let mut builder = self.client.request(Method::POST, url); + + // Compute checksum - S3 *requires* this for DeleteObjects requests, so we default to + // their algorithm if the user hasn't specified one. + let checksum = self.config().checksum.unwrap_or(Checksum::SHA256); + let digest = checksum.digest(&body); + builder = builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); + let payload_sha256 = if checksum == Checksum::SHA256 { + Some(digest) + } else { + None + }; + + let response = builder + .header(CONTENT_TYPE, "application/xml") + .body(body) + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + payload_sha256.as_deref(), + ) + .send_retry(&self.config.retry_config) + .await + .context(DeleteObjectsRequestSnafu {})? + .bytes() + .await + .context(DeleteObjectsResponseSnafu {})?; + + let response: BatchDeleteResponse = quick_xml::de::from_reader(response.reader()) + .map_err(|err| Error::InvalidDeleteObjectsResponse { + source: Box::new(err), + })?; + + // Assume all were ok, then fill in errors. This guarantees output order + // matches input order. 
+ let mut results: Vec> = paths.iter().cloned().map(Ok).collect(); + for content in response.content.into_iter() { + if let DeleteObjectResult::Error(error) = content { + let path = Path::parse(&error.key).map_err(|err| { + Error::InvalidDeleteObjectsResponse { + source: Box::new(err), + } + })?; + let i = paths.iter().find_position(|&p| p == &path).unwrap().0; + results[i] = Err(Error::from(error).into()); + } + } + + Ok(results) + } + /// Make an S3 Copy request pub async fn copy_request(&self, from: &Path, to: &Path) -> Result<()> { let credential = self.get_credential().await?; diff --git a/src/aws/mod.rs b/src/aws/mod.rs index fac6165..3696e4a 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -34,6 +34,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; +use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; @@ -252,6 +253,26 @@ impl ObjectStore for AmazonS3 { self.client.delete_request(location, &()).await } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + locations + .try_chunks(1_000) + .map(move |locations| async { + // Early return the error. We ignore the paths that have already been + // collected into the chunk. + let locations = locations.map_err(|e| e.1)?; + self.client + .bulk_delete_request(locations) + .await + .map(futures::stream::iter) + }) + .buffered(20) + .try_flatten() + .boxed() + } + async fn list( &self, prefix: Option<&Path>, diff --git a/src/http/client.rs b/src/http/client.rs index 6feacbb..1d3df34 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -225,7 +225,13 @@ impl Client { .delete(url) .send_retry(&self.retry_config) .await - .context(RequestSnafu)?; + .map_err(|source| match source.status() { + Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { + source: Box::new(source), + path: path.to_string(), + }, + _ => Error::Request { source }.into(), + })?; Ok(()) } diff --git a/src/lib.rs b/src/lib.rs index 7116a87..c5bf40c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -386,6 +386,63 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Delete the object at the specified location. async fn delete(&self, location: &Path) -> Result<()>; + /// Delete all the objects at the specified locations + /// + /// When supported, this method will use bulk operations that delete more + /// than one object per a request. The default implementation will call + /// the single object delete method for each location, but with up to 10 + /// concurrent requests. + /// + /// The returned stream yields the results of the delete operations in the + /// same order as the input locations. However, some errors will be from + /// an overall call to a bulk delete operation, and not from a specific + /// location. + /// + /// If the object did not exist, the result may be an error or a success, + /// depending on the behavior of the underlying store. For example, local + /// filesystems, GCP, and Azure return an error, while S3 and in-memory will + /// return Ok. If it is an error, it will be [`Error::NotFound`]. 
+ /// + /// ``` + /// # use object_store::local::LocalFileSystem; + /// # async fn example() -> Result<(), Box> { + /// # let root = tempfile::TempDir::new().unwrap(); + /// # let store = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + /// use object_store::{ObjectStore, ObjectMeta}; + /// use object_store::path::Path; + /// use futures::{StreamExt, TryStreamExt}; + /// use bytes::Bytes; + /// + /// // Create two objects + /// store.put(&Path::from("foo"), Bytes::from("foo")).await?; + /// store.put(&Path::from("bar"), Bytes::from("bar")).await?; + /// + /// // List object + /// let locations = store.list(None).await? + /// .map(|meta: Result| meta.map(|m| m.location)) + /// .boxed(); + /// + /// // Delete them + /// store.delete_stream(locations).try_collect::>().await?; + /// # Ok(()) + /// # } + /// # let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + /// # rt.block_on(example()).unwrap(); + /// ``` + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + locations + .map(|location| async { + let location = location?; + self.delete(&location).await?; + Ok(location) + }) + .buffered(10) + .boxed() + } + /// List all the objects with the given prefix. /// /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of @@ -515,6 +572,13 @@ impl ObjectStore for Box { self.as_ref().delete(location).await } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + self.as_ref().delete_stream(locations) + } + async fn list( &self, prefix: Option<&Path>, @@ -1119,6 +1183,49 @@ mod tests { assert_eq!(actual, expected, "{prefix:?} - {offset:?}"); } + // Test bulk delete + let paths = vec![ + Path::from("a/a.file"), + Path::from("a/a/b.file"), + Path::from("aa/a.file"), + Path::from("does_not_exist"), + Path::from("I'm a < & weird path"), + Path::from("ab/a.file"), + Path::from("a/😀.file"), + ]; + + storage.put(&paths[4], "foo".into()).await.unwrap(); + + let out_paths = storage + .delete_stream(futures::stream::iter(paths.clone()).map(Ok).boxed()) + .collect::>() + .await; + + assert_eq!(out_paths.len(), paths.len()); + + let expect_errors = [3]; + + for (i, input_path) in paths.iter().enumerate() { + let err = storage.head(input_path).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + if expect_errors.contains(&i) { + // Some object stores will report NotFound, but others (such as S3) will + // report success regardless. + match &out_paths[i] { + Err(Error::NotFound { path: out_path, .. 
}) => { + assert!(out_path.ends_with(&input_path.to_string())); + } + Ok(out_path) => { + assert_eq!(out_path, input_path); + } + _ => panic!("unexpected error"), + } + } else { + assert_eq!(out_paths[i].as_ref().unwrap(), input_path); + } + } + delete_fixtures(storage).await; } @@ -1471,11 +1578,17 @@ mod tests { } async fn delete_fixtures(storage: &DynObjectStore) { - let paths = flatten_list_stream(storage, None).await.unwrap(); - - for f in &paths { - storage.delete(f).await.unwrap(); - } + let paths = storage + .list(None) + .await + .unwrap() + .map_ok(|meta| meta.location) + .boxed(); + storage + .delete_stream(paths) + .try_collect::>() + .await + .unwrap(); } /// Test that the returned stream does not borrow the lifetime of Path diff --git a/src/limit.rs b/src/limit.rs index e009111..630fd14 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -148,6 +148,13 @@ impl ObjectStore for LimitStore { self.inner.delete(location).await } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + self.inner.delete_stream(locations) + } + async fn list( &self, prefix: Option<&Path>, diff --git a/src/local.rs b/src/local.rs index 52719f1..bbd54db 100644 --- a/src/local.rs +++ b/src/local.rs @@ -444,9 +444,12 @@ impl ObjectStore for LocalFileSystem { async fn delete(&self, location: &Path) -> Result<()> { let path = self.config.path_to_filesystem(location)?; - maybe_spawn_blocking(move || { - std::fs::remove_file(&path).context(UnableToDeleteFileSnafu { path })?; - Ok(()) + maybe_spawn_blocking(move || match std::fs::remove_file(&path) { + Ok(_) => Ok(()), + Err(e) => Err(match e.kind() { + ErrorKind::NotFound => Error::NotFound { path, source: e }.into(), + _ => Error::UnableToDeleteFile { path, source: e }.into(), + }), }) .await } From 2dac010a2dc0dfa64a25d90c6c90acfee4ea5523 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 30 May 2023 13:52:45 -0700 Subject: [PATCH 153/397] feat: use exactly equal parts in multipart upload (#4305) * refactor: use exactly equal parts in multipart upload * Improve test * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix lifetime --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/lib.rs | 18 ++++++++++++++--- src/multipart.rs | 52 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c5bf40c..98bbb7a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -898,6 +898,8 @@ mod test_util { mod tests { use super::*; use crate::test_util::flatten_list_stream; + use bytes::{BufMut, BytesMut}; + use itertools::Itertools; use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { @@ -1308,8 +1310,18 @@ mod tests { } } + fn get_random_bytes(len: usize) -> Bytes { + use rand::Rng; + let mut rng = rand::thread_rng(); + let mut bytes = BytesMut::with_capacity(len); + for _ in 0..len { + bytes.put_u8(rng.gen()); + } + bytes.freeze() + } + fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { - std::iter::repeat(Bytes::from_iter(std::iter::repeat(b'x').take(chunk_length))) + std::iter::repeat(get_random_bytes(chunk_length)) .take(num_chunks) .collect() } @@ -1344,8 +1356,8 @@ mod tests { assert_eq!(bytes_expected, bytes_written); // Can overwrite some storage - // Sizes carefully chosen to exactly hit min limit of 5 MiB - let data = get_vec_of_bytes(242_880, 22); + // Sizes 
chosen to ensure we write three parts + let data = (0..7).map(|_| get_random_bytes(3_200_000)).collect_vec(); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { diff --git a/src/multipart.rs b/src/multipart.rs index 0606fb5..2658030 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -60,8 +60,11 @@ where max_concurrency: usize, /// Buffer that will be sent in next upload. current_buffer: Vec, - /// Minimum size of a part in bytes - min_part_size: usize, + /// Size of each part. + /// + /// While S3 and Minio support variable part sizes, R2 requires they all be + /// exactly the same size. + part_size: usize, /// Index of current part current_part_idx: usize, /// The completion task @@ -85,12 +88,21 @@ where // Minimum size of 5 MiB // https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html // https://cloud.google.com/storage/quotas#requests - min_part_size: 5_242_880, + part_size: 10 * 1024 * 1024, current_part_idx: 0, completion_task: None, } } + // Add data to the current buffer, returning the number of bytes added + fn add_to_buffer(mut self: Pin<&mut Self>, buf: &[u8], offset: usize) -> usize { + let remaining_capacity = self.part_size - self.current_buffer.len(); + let to_copy = std::cmp::min(remaining_capacity, buf.len() - offset); + self.current_buffer + .extend_from_slice(&buf[offset..offset + to_copy]); + to_copy + } + pub fn poll_tasks( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -158,15 +170,21 @@ where // Poll current tasks self.as_mut().poll_tasks(cx)?; - // If adding buf to pending buffer would trigger send, check - // whether we have capacity for another task. - let enough_to_send = - (buf.len() + self.current_buffer.len()) >= self.min_part_size; - if enough_to_send && self.tasks.len() < self.max_concurrency { - // If we do, copy into the buffer and submit the task, and return ready. - self.current_buffer.extend_from_slice(buf); + let mut offset = 0; + + loop { + // Fill up current buffer + offset += self.as_mut().add_to_buffer(buf, offset); - let out_buffer = std::mem::take(&mut self.current_buffer); + // If we don't have a full buffer or we have too many tasks, break + if self.current_buffer.len() < self.part_size + || self.tasks.len() >= self.max_concurrency + { + break; + } + + let new_buffer = Vec::with_capacity(self.part_size); + let out_buffer = std::mem::replace(&mut self.current_buffer, new_buffer); let inner = Arc::clone(&self.inner); let part_idx = self.current_part_idx; self.tasks.push(Box::pin(async move { @@ -177,14 +195,14 @@ where // We need to poll immediately after adding to setup waker self.as_mut().poll_tasks(cx)?; + } - Poll::Ready(Ok(buf.len())) - } else if !enough_to_send { - self.current_buffer.extend_from_slice(buf); - Poll::Ready(Ok(buf.len())) - } else { - // Waker registered by call to poll_tasks at beginning + // If offset is zero, then we didn't write anything because we didn't + // have capacity for more tasks and our buffer is full. 
+ if offset == 0 && !buf.is_empty() { Poll::Pending + } else { + Poll::Ready(Ok(offset)) } } From 9a8080cb232c07d20d90b9e8b921fb1bca8a6af4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 11:26:28 +0100 Subject: [PATCH 154/397] Fix clippy for object_store (#4344) --- src/aws/mod.rs | 18 ++++++++---------- src/path/mod.rs | 9 +++------ 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 3696e4a..8de4b7c 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -1127,23 +1127,21 @@ mod tests { ) .with_allow_http(true); - let config = - if let Some(endpoint) = env::var("OBJECT_STORE_AWS_ENDPOINT").ok() { - config.with_endpoint(endpoint) - } else { - config - }; + let config = if let Ok(endpoint) = env::var("OBJECT_STORE_AWS_ENDPOINT") { + config.with_endpoint(endpoint) + } else { + config + }; - let config = if let Some(token) = - env::var("OBJECT_STORE_AWS_SESSION_TOKEN").ok() + let config = if let Ok(token) = env::var("OBJECT_STORE_AWS_SESSION_TOKEN") { config.with_token(token) } else { config }; - let config = if let Some(virtual_hosted_style_request) = - env::var("OBJECT_STORE_VIRTUAL_HOSTED_STYLE_REQUEST").ok() + let config = if let Ok(virtual_hosted_style_request) = + env::var("OBJECT_STORE_VIRTUAL_HOSTED_STYLE_REQUEST") { config.with_virtual_hosted_style_request( virtual_hosted_style_request.trim().parse().unwrap(), diff --git a/src/path/mod.rs b/src/path/mod.rs index 29b1341..ab30e0e 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -438,18 +438,15 @@ mod tests { assert!(existing_path.prefix_match(&prefix).is_none()); // Prefix matches but there aren't any parts after it - let existing_path = Path::from("apple/bear/cow/dog"); - - let prefix = existing_path.clone(); - assert_eq!(existing_path.prefix_match(&prefix).unwrap().count(), 0); + let existing = Path::from("apple/bear/cow/dog"); + assert_eq!(existing.prefix_match(&existing).unwrap().count(), 0); assert_eq!(Path::default().parts().count(), 0); } #[test] fn prefix_matches() { let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something"]); - let needle = haystack.clone(); // self starts with self assert!( haystack.prefix_matches(&haystack), @@ -457,7 +454,7 @@ mod tests { ); // a longer prefix doesn't match - let needle = needle.child("longer now"); + let needle = haystack.child("longer now"); assert!( !haystack.prefix_matches(&needle), "{haystack:?} shouldn't have started with {needle:?}" From 5c119959bf4c7d4c26d00f2731d9d41d1edd9857 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 14:28:35 +0100 Subject: [PATCH 155/397] Don't exclude FIFO files from LocalFileSystem (#4345) --- src/local.rs | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/local.rs b/src/local.rs index bbd54db..6039f8d 100644 --- a/src/local.rs +++ b/src/local.rs @@ -429,11 +429,11 @@ impl ObjectStore for LocalFileSystem { path: location.to_string(), }, }), - Ok(m) => match m.is_file() { + Ok(m) => match !m.is_dir() { true => Ok(m), false => Err(Error::NotFound { path, - source: io::Error::new(ErrorKind::NotFound, "is not file"), + source: io::Error::new(ErrorKind::NotFound, "is directory"), }), }, }?; @@ -897,11 +897,11 @@ fn open_file(path: &PathBuf) -> Result { source: e, }, }), - Ok((metadata, file)) => match metadata.is_file() { + Ok((metadata, file)) => match !metadata.is_dir() { true => Ok(file), false => 
Err(Error::NotFound { path: path.clone(), - source: io::Error::new(ErrorKind::NotFound, "not a file"), + source: io::Error::new(ErrorKind::NotFound, "is directory"), }), }, }?; @@ -1491,21 +1491,26 @@ mod unix_test { use crate::{ObjectStore, Path}; use nix::sys::stat; use nix::unistd; - use std::time::Duration; + use std::fs::OpenOptions; use tempfile::TempDir; - use tokio::time::timeout; #[tokio::test] - async fn test_head_fifo() { + async fn test_fifo() { let filename = "some_file"; let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - unistd::mkfifo(&root.path().join(filename), stat::Mode::S_IRWXU).unwrap(); + let path = root.path().join(filename); + unistd::mkfifo(&path, stat::Mode::S_IRWXU).unwrap(); + let location = Path::from(filename); - if (timeout(Duration::from_millis(10), integration.head(&location)).await) - .is_err() - { - panic!("Did not receive value within 10 ms"); - } + integration.head(&location).await.unwrap(); + + // Need to open read and write side in parallel + let spawned = tokio::task::spawn_blocking(|| { + OpenOptions::new().write(true).open(path).unwrap(); + }); + + integration.get(&location).await.unwrap(); + spawned.await.unwrap(); } } From ce6affc3872fe00a369de57c4db5cd3071ff495b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 15:40:28 +0100 Subject: [PATCH 156/397] Fix support for ECS IAM credentials (#4310) --- src/aws/credential.rs | 45 +++++++++++++++++++++++++++++++++++++- src/aws/mod.rs | 51 +++++++++++++++++++++++++++++-------------- 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 909dde0..be0ffa5 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -20,7 +20,7 @@ use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::client::TokenProvider; use crate::util::hmac_sha256; -use crate::{Result, RetryConfig}; +use crate::{CredentialProvider, Result, RetryConfig}; use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; @@ -542,6 +542,49 @@ async fn web_identity( }) } +/// Credentials sourced from a task IAM role +/// +/// +#[derive(Debug)] +pub struct TaskCredentialProvider { + pub url: String, + pub retry: RetryConfig, + pub client: Client, + pub cache: TokenCache>, +} + +#[async_trait] +impl CredentialProvider for TaskCredentialProvider { + type Credential = AwsCredential; + + async fn get_credential(&self) -> Result> { + self.cache + .get_or_insert_with(|| task_credential(&self.client, &self.retry, &self.url)) + .await + .map_err(|source| crate::Error::Generic { + store: STORE, + source, + }) + } +} + +/// +async fn task_credential( + client: &Client, + retry: &RetryConfig, + url: &str, +) -> Result>, StdError> { + let creds: InstanceCredentials = + client.get(url).send_retry(retry).await?.json().await?; + + let now = Utc::now(); + let ttl = (creds.expiration - now).to_std().unwrap_or_default(); + Ok(TemporaryToken { + token: Arc::new(creds.into()), + expiry: Some(Instant::now() + ttl), + }) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 8de4b7c..8a486f9 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -46,7 +46,9 @@ use url::Url; pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; -use crate::aws::credential::{InstanceCredentialProvider, WebIdentityProvider}; +use crate::aws::credential::{ + 
InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, +}; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::{ @@ -87,9 +89,6 @@ pub use credential::{AwsAuthorizer, AwsCredential}; /// Default metadata endpoint static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; -/// ECS metadata endpoint -static ECS_METADATA_ENDPOINT: &str = "http://169.254.170.2"; - /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -399,6 +398,8 @@ pub struct AmazonS3Builder { checksum_algorithm: Option>, /// Metadata endpoint, see metadata_endpoint: Option, + /// Container credentials URL, see + container_credentials_relative_uri: Option, /// Client options client_options: ClientOptions, /// Credentials @@ -529,6 +530,11 @@ pub enum AmazonS3ConfigKey { /// - `metadata_endpoint` MetadataEndpoint, + /// Set the container credentials relative URI + /// + /// + ContainerCredentialsRelativeUri, + /// Client options Client(ClientConfigKey), } @@ -548,6 +554,9 @@ impl AsRef for AmazonS3ConfigKey { Self::MetadataEndpoint => "aws_metadata_endpoint", Self::UnsignedPayload => "aws_unsigned_payload", Self::Checksum => "aws_checksum_algorithm", + Self::ContainerCredentialsRelativeUri => { + "aws_container_credentials_relative_uri" + } Self::Client(opt) => opt.as_ref(), } } @@ -578,6 +587,9 @@ impl FromStr for AmazonS3ConfigKey { "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), + "aws_container_credentials_relative_uri" => { + Ok(Self::ContainerCredentialsRelativeUri) + } // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -625,15 +637,6 @@ impl AmazonS3Builder { } } - // This env var is set in ECS - // https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html - if let Ok(metadata_relative_uri) = - std::env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") - { - builder.metadata_endpoint = - Some(format!("{ECS_METADATA_ENDPOINT}{metadata_relative_uri}")); - } - builder } @@ -691,6 +694,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Checksum => { self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { + self.container_credentials_relative_uri = Some(value.into()) + } AmazonS3ConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } @@ -758,6 +764,9 @@ impl AmazonS3Builder { self.checksum_algorithm.as_ref().map(ToString::to_string) } AmazonS3ConfigKey::Client(key) => self.client_options.get_config_value(key), + AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { + self.container_credentials_relative_uri.clone() + } } } @@ -999,6 +1008,15 @@ impl AmazonS3Builder { client, self.retry_config.clone(), )) as _ + } else if let Some(uri) = self.container_credentials_relative_uri { + info!("Using Task credential provider"); + Arc::new(TaskCredentialProvider { + url: format!("http://169.254.170.2{uri}"), + retry: self.retry_config.clone(), + // The instance metadata endpoint is access over HTTP + client: self.client_options.clone().with_allow_http(true).client()?, + cache: Default::default(), + }) as _ } else { info!("Using Instance credential provider"); @@ -1199,9 +1217,10 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); 
assert_eq!(builder.token.unwrap(), aws_session_token); - let metadata_uri = - format!("{ECS_METADATA_ENDPOINT}{container_creds_relative_uri}"); - assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); + assert_eq!( + builder.container_credentials_relative_uri.unwrap(), + container_creds_relative_uri + ); assert_eq!( builder.checksum_algorithm.unwrap().get().unwrap(), Checksum::SHA256 From cd02568c8ea570bf407baa8e35facbd9d2931f7b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 18:39:59 +0100 Subject: [PATCH 157/397] Fix ObjectStore::get_range for GetResult::File (#4350) (#4351) * Fix ObjectStore::get_range for GetResult::File (#4350) * Review feedback --- Cargo.toml | 4 ++ src/lib.rs | 14 ++++- src/local.rs | 2 +- tests/get_range_file.rs | 116 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 tests/get_range_file.rs diff --git a/Cargo.toml b/Cargo.toml index 28bf29f..4002a18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,3 +75,7 @@ tempfile = "3.1.0" futures-test = "0.3" rand = "0.8" hyper = { version = "0.14.24", features = ["server"] } + +[[test]] +name = "get_range_file" +path = "tests/get_range_file.rs" diff --git a/src/lib.rs b/src/lib.rs index 98bbb7a..864cabc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -359,10 +359,20 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// in the given byte range async fn get_range(&self, location: &Path, range: Range) -> Result { let options = GetOptions { - range: Some(range), + range: Some(range.clone()), ..Default::default() }; - self.get_opts(location, options).await?.bytes().await + // Temporary until GetResult::File supports range (#4352) + match self.get_opts(location, options).await? { + GetResult::Stream(s) => collect_bytes(s, None).await, + #[cfg(not(target_arch = "wasm32"))] + GetResult::File(mut file, path) => { + maybe_spawn_blocking(move || local::read_range(&mut file, &path, range)) + .await + } + #[cfg(target_arch = "wasm32")] + _ => unimplemented!("File IO not implemented on wasm32."), + } } /// Return the bytes that are stored at the specified location diff --git a/src/local.rs b/src/local.rs index 6039f8d..ffff6a5 100644 --- a/src/local.rs +++ b/src/local.rs @@ -863,7 +863,7 @@ impl AsyncWrite for LocalUpload { } } -fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { +pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) .context(SeekSnafu { path })?; diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs new file mode 100644 index 0000000..f926e3b --- /dev/null +++ b/tests/get_range_file.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests the default implementation of get_range handles GetResult::File correctly (#4350) + +use async_trait::async_trait; +use bytes::Bytes; +use futures::stream::BoxStream; +use object_store::local::LocalFileSystem; +use object_store::path::Path; +use object_store::{ + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, +}; +use std::fmt::Formatter; +use tempfile::tempdir; +use tokio::io::AsyncWrite; + +#[derive(Debug)] +struct MyStore(LocalFileSystem); + +impl std::fmt::Display for MyStore { + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +#[async_trait] +impl ObjectStore for MyStore { + async fn put(&self, path: &Path, data: Bytes) -> object_store::Result<()> { + self.0.put(path, data).await + } + + async fn put_multipart( + &self, + _: &Path, + ) -> object_store::Result<(MultipartId, Box)> { + todo!() + } + + async fn abort_multipart( + &self, + _: &Path, + _: &MultipartId, + ) -> object_store::Result<()> { + todo!() + } + + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> object_store::Result { + self.0.get_opts(location, options).await + } + + async fn head(&self, _: &Path) -> object_store::Result { + todo!() + } + + async fn delete(&self, _: &Path) -> object_store::Result<()> { + todo!() + } + + async fn list( + &self, + _: Option<&Path>, + ) -> object_store::Result>> { + todo!() + } + + async fn list_with_delimiter( + &self, + _: Option<&Path>, + ) -> object_store::Result { + todo!() + } + + async fn copy(&self, _: &Path, _: &Path) -> object_store::Result<()> { + todo!() + } + + async fn copy_if_not_exists(&self, _: &Path, _: &Path) -> object_store::Result<()> { + todo!() + } +} + +#[tokio::test] +async fn test_get_range() { + let tmp = tempdir().unwrap(); + let store = MyStore(LocalFileSystem::new_with_prefix(tmp.path()).unwrap()); + let path = Path::from("foo"); + + let expected = Bytes::from_static(b"hello world"); + store.put(&path, expected.clone()).await.unwrap(); + let fetched = store.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(expected, fetched); + + for range in [0..10, 3..5, 0..expected.len()] { + let data = store.get_range(&path, range.clone()).await.unwrap(); + assert_eq!(&data[..], &expected[range]) + } +} From 8d534560486cf650cd28f798925f7833d3d6ae54 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 19:01:44 +0100 Subject: [PATCH 158/397] Prepare object_store 0.6.1 (#4348) * Prepare object_store 0.6.1 * Final tweaks --- CHANGELOG-old.md | 64 ++++++++++++++++++++++++++++++ CHANGELOG.md | 68 +++++++------------------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 4 files changed, 81 insertions(+), 57 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index c9c4e28..3880205 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,70 @@ # Historical Changelog +## [object_store_0.6.0](https://github.com/apache/arrow-rs/tree/object_store_0.6.0) (2023-05-18) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.6...object_store_0.6.0) + +**Breaking changes:** + +- Add ObjectStore::get\_opts \(\#2241\) [\#4212](https://github.com/apache/arrow-rs/pull/4212) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Simplify ObjectStore configuration pattern 
[\#4189](https://github.com/apache/arrow-rs/pull/4189) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: fix: Incorrect parsing of https Path Style S3 url [\#4082](https://github.com/apache/arrow-rs/pull/4082) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- feat: add etag for objectMeta [\#3937](https://github.com/apache/arrow-rs/pull/3937) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) + +**Implemented enhancements:** + +- Object Store Authorization [\#4223](https://github.com/apache/arrow-rs/issues/4223) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Use XML API for GCS [\#4209](https://github.com/apache/arrow-rs/issues/4209) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) +- Return Error on Invalid Config Value [\#4191](https://github.com/apache/arrow-rs/issues/4191) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Extensible ObjectStore Authentication [\#4163](https://github.com/apache/arrow-rs/issues/4163) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: When using an AWS profile, obtain the default AWS region from the active profile [\#4158](https://github.com/apache/arrow-rs/issues/4158) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- InMemory append API [\#4152](https://github.com/apache/arrow-rs/issues/4152) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) +- \[object\_store\] Retry requests on connection error [\#4119](https://github.com/apache/arrow-rs/issues/4119) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Instantiate object store from provided url with store options [\#4047](https://github.com/apache/arrow-rs/issues/4047) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Builders \(S3/Azure/GCS\) are missing the `get method` to get the actual configuration information [\#4021](https://github.com/apache/arrow-rs/issues/4021) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- ObjectStore::head Returns Directory for LocalFileSystem and Hierarchical Azure [\#4230](https://github.com/apache/arrow-rs/issues/4230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: different behavior from aws cli for default profile [\#4137](https://github.com/apache/arrow-rs/issues/4137) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ImdsManagedIdentityOAuthProvider should send resource ID instead of OIDC scope [\#4096](https://github.com/apache/arrow-rs/issues/4096) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) +- object\_store: Incorrect parsing of https Path Style S3 url [\#4078](https://github.com/apache/arrow-rs/issues/4078) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] `local::tests::test_list_root` test fails during release 
verification [\#3772](https://github.com/apache/arrow-rs/issues/3772) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Remove AWS\_PROFILE support [\#4238](https://github.com/apache/arrow-rs/pull/4238) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose AwsAuthorizer [\#4237](https://github.com/apache/arrow-rs/pull/4237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose CredentialProvider [\#4235](https://github.com/apache/arrow-rs/pull/4235) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Return NotFound for directories in Head and Get \(\#4230\) [\#4231](https://github.com/apache/arrow-rs/pull/4231) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Standardise credentials API \(\#4223\) \(\#4163\) [\#4225](https://github.com/apache/arrow-rs/pull/4225) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Extract Common Listing and Retrieval Functionality [\#4220](https://github.com/apache/arrow-rs/pull/4220) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object-store\): extend Options API for http client [\#4208](https://github.com/apache/arrow-rs/pull/4208) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Consistently use GCP XML API [\#4207](https://github.com/apache/arrow-rs/pull/4207) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement list\_with\_offset for PrefixStore [\#4203](https://github.com/apache/arrow-rs/pull/4203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Allow setting ClientOptions with Options API [\#4202](https://github.com/apache/arrow-rs/pull/4202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Create ObjectStore from URL and Options \(\#4047\) [\#4200](https://github.com/apache/arrow-rs/pull/4200) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Skip test\_list\_root on OS X \(\#3772\) [\#4198](https://github.com/apache/arrow-rs/pull/4198) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Recognise R2 URLs for S3 object store \(\#4190\) [\#4194](https://github.com/apache/arrow-rs/pull/4194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix ImdsManagedIdentityProvider \(\#4096\) [\#4193](https://github.com/apache/arrow-rs/pull/4193) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Deffered Object Store Config Parsing \(\#4191\) [\#4192](https://github.com/apache/arrow-rs/pull/4192) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object Store \(AWS\): Support dynamically resolving S3 bucket region [\#4188](https://github.com/apache/arrow-rs/pull/4188) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) +- Faster prefix match in object\_store path handling [\#4164](https://github.com/apache/arrow-rs/pull/4164) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object Store \(AWS\): Support region configured via named profile [\#4161](https://github.com/apache/arrow-rs/pull/4161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) +- InMemory append API [\#4153](https://github.com/apache/arrow-rs/pull/4153) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([berkaysynnada](https://github.com/berkaysynnada)) +- docs: fix the wrong ln command in CONTRIBUTING.md [\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) +- Display the file path in the error message when failed to open credentials file for GCS [\#4124](https://github.com/apache/arrow-rs/pull/4124) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([haoxins](https://github.com/haoxins)) +- Retry on Connection Errors [\#4120](https://github.com/apache/arrow-rs/pull/4120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kindly](https://github.com/kindly)) +- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) +- Use reqwest build\_split [\#4039](https://github.com/apache/arrow-rs/pull/4039) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) +- Add get\_config\_value to AWS/Azure/GCP Builders [\#4035](https://github.com/apache/arrow-rs/pull/4035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([r4ntix](https://github.com/r4ntix)) +- Update AWS SDK [\#3993](https://github.com/apache/arrow-rs/pull/3993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + ## [object_store_0.5.6](https://github.com/apache/arrow-rs/tree/object_store_0.5.6) (2023-03-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.5...object_store_0.5.6) diff --git a/CHANGELOG.md b/CHANGELOG.md index bde0f75..fe25e23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,70 +19,30 @@ # Changelog -## [object_store_0.6.0](https://github.com/apache/arrow-rs/tree/object_store_0.6.0) (2023-05-18) +## [object_store_0.6.1](https://github.com/apache/arrow-rs/tree/object_store_0.6.1) (2023-06-02) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.6...object_store_0.6.0) - -**Breaking changes:** - -- Add ObjectStore::get\_opts \(\#2241\) [\#4212](https://github.com/apache/arrow-rs/pull/4212) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Simplify ObjectStore configuration pattern [\#4189](https://github.com/apache/arrow-rs/pull/4189) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- object\_store: fix: Incorrect parsing of https Path Style S3 url [\#4082](https://github.com/apache/arrow-rs/pull/4082) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([roeap](https://github.com/roeap)) -- feat: add etag for objectMeta [\#3937](https://github.com/apache/arrow-rs/pull/3937) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.0...object_store_0.6.1) **Implemented enhancements:** -- Object Store Authorization [\#4223](https://github.com/apache/arrow-rs/issues/4223) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Use XML API for GCS [\#4209](https://github.com/apache/arrow-rs/issues/4209) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) -- Return Error on Invalid Config Value [\#4191](https://github.com/apache/arrow-rs/issues/4191) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Extensible ObjectStore Authentication [\#4163](https://github.com/apache/arrow-rs/issues/4163) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: When using an AWS profile, obtain the default AWS region from the active profile [\#4158](https://github.com/apache/arrow-rs/issues/4158) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- InMemory append API [\#4152](https://github.com/apache/arrow-rs/issues/4152) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) -- \[object\_store\] Retry requests on connection error [\#4119](https://github.com/apache/arrow-rs/issues/4119) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Instantiate object store from provided url with store options [\#4047](https://github.com/apache/arrow-rs/issues/4047) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Builders \(S3/Azure/GCS\) are missing the `get method` to get the actual configuration information [\#4021](https://github.com/apache/arrow-rs/issues/4021) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support multipart upload in R2 [\#4304](https://github.com/apache/arrow-rs/issues/4304) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- ObjectStore::head Returns Directory for LocalFileSystem and Hierarchical Azure [\#4230](https://github.com/apache/arrow-rs/issues/4230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: different behavior from aws cli for default profile [\#4137](https://github.com/apache/arrow-rs/issues/4137) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- ImdsManagedIdentityOAuthProvider should send resource ID instead of OIDC scope [\#4096](https://github.com/apache/arrow-rs/issues/4096) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) -- object\_store: Incorrect parsing of https Path Style S3 url [\#4078](https://github.com/apache/arrow-rs/issues/4078) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[object\_store\] `local::tests::test_list_root` test fails during release verification [\#3772](https://github.com/apache/arrow-rs/issues/3772) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Default ObjectStore::get\_range Doesn't Apply Range to GetResult::File [\#4350](https://github.com/apache/arrow-rs/issues/4350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -**Merged pull requests:** +**Closed issues:** -- Remove AWS\_PROFILE support [\#4238](https://github.com/apache/arrow-rs/pull/4238) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Expose AwsAuthorizer [\#4237](https://github.com/apache/arrow-rs/pull/4237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Expose CredentialProvider [\#4235](https://github.com/apache/arrow-rs/pull/4235) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Return NotFound for directories in Head and Get \(\#4230\) [\#4231](https://github.com/apache/arrow-rs/pull/4231) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Standardise credentials API \(\#4223\) \(\#4163\) [\#4225](https://github.com/apache/arrow-rs/pull/4225) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Extract Common Listing and Retrieval Functionality [\#4220](https://github.com/apache/arrow-rs/pull/4220) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat\(object-store\): extend Options API for http client [\#4208](https://github.com/apache/arrow-rs/pull/4208) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Consistently use GCP XML API [\#4207](https://github.com/apache/arrow-rs/pull/4207) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Implement list\_with\_offset for PrefixStore [\#4203](https://github.com/apache/arrow-rs/pull/4203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Allow setting ClientOptions with Options API [\#4202](https://github.com/apache/arrow-rs/pull/4202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Create ObjectStore from URL and Options \(\#4047\) [\#4200](https://github.com/apache/arrow-rs/pull/4200) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Skip test\_list\_root on OS X \(\#3772\) [\#4198](https://github.com/apache/arrow-rs/pull/4198) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Recognise R2 URLs for S3 object store \(\#4190\) [\#4194](https://github.com/apache/arrow-rs/pull/4194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix ImdsManagedIdentityProvider \(\#4096\) [\#4193](https://github.com/apache/arrow-rs/pull/4193) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Deffered Object Store Config Parsing \(\#4191\) [\#4192](https://github.com/apache/arrow-rs/pull/4192) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([tustvold](https://github.com/tustvold)) -- Object Store \(AWS\): Support dynamically resolving S3 bucket region [\#4188](https://github.com/apache/arrow-rs/pull/4188) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) -- Faster prefix match in object\_store path handling [\#4164](https://github.com/apache/arrow-rs/pull/4164) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Object Store \(AWS\): Support region configured via named profile [\#4161](https://github.com/apache/arrow-rs/pull/4161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) -- InMemory append API [\#4153](https://github.com/apache/arrow-rs/pull/4153) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([berkaysynnada](https://github.com/berkaysynnada)) -- docs: fix the wrong ln command in CONTRIBUTING.md [\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) -- Display the file path in the error message when failed to open credentials file for GCS [\#4124](https://github.com/apache/arrow-rs/pull/4124) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([haoxins](https://github.com/haoxins)) -- Retry on Connection Errors [\#4120](https://github.com/apache/arrow-rs/pull/4120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kindly](https://github.com/kindly)) -- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) -- Use reqwest build\_split [\#4039](https://github.com/apache/arrow-rs/pull/4039) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) -- Add get\_config\_value to AWS/Azure/GCP Builders [\#4035](https://github.com/apache/arrow-rs/pull/4035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([r4ntix](https://github.com/r4ntix)) -- Update AWS SDK [\#3993](https://github.com/apache/arrow-rs/pull/3993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- \[object\_store - AmazonS3Builder\] incorrect metadata\_endpoint set in `from_env` in an ECS environment [\#4283](https://github.com/apache/arrow-rs/issues/4283) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +**Merged pull requests:** +- Fix ObjectStore::get\_range for GetResult::File \(\#4350\) [\#4351](https://github.com/apache/arrow-rs/pull/4351) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Don't exclude FIFO files from LocalFileSystem [\#4345](https://github.com/apache/arrow-rs/pull/4345) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix support for ECS IAM credentials [\#4310](https://github.com/apache/arrow-rs/pull/4310) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat: use exactly equal parts in multipart upload [\#4305](https://github.com/apache/arrow-rs/pull/4305) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([wjones127](https://github.com/wjones127)) +- Set ECS specific metadata endpoint [\#4288](https://github.com/apache/arrow-rs/pull/4288) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jfuechsl](https://github.com/jfuechsl)) +- Prepare 40.0.0 release [\#4245](https://github.com/apache/arrow-rs/pull/4245) ([tustvold](https://github.com/tustvold)) +- feat: support bulk deletes in object\_store [\#4060](https://github.com/apache/arrow-rs/pull/4060) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index 4002a18..5e2009d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.6.0" +version = "0.6.1" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 6090630..3e9f8bd 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.6" -FUTURE_RELEASE="object_store_0.6.0" +SINCE_TAG="object_store_0.6.0" +FUTURE_RELEASE="object_store_0.6.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From f3576f8f098073056b03cc8abd3220fbc6153595 Mon Sep 17 00:00:00 2001 From: Vaibhav Rabber Date: Wed, 21 Jun 2023 04:23:14 +0530 Subject: [PATCH 159/397] gcp: Exclude authorization header when bearer empty (#4418) GCP tries to authorize when there's the authorization header. If the bearer is empty, exclude the header since this doesn't let us get a public object. Signed-off-by: Vaibhav --- src/gcp/mod.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 7b11273..d4d3703 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -394,16 +394,19 @@ impl GetClient for GoogleCloudStorageClient { false => Method::GET, }; - let response = self - .client - .request(method, url) - .bearer_auth(&credential.bearer) - .with_get_options(options) - .send_retry(&self.retry_config) - .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; + let mut request = self.client.request(method, url).with_get_options(options); + + if !credential.bearer.is_empty() { + request = request.bearer_auth(&credential.bearer); + } + + let response = + request + .send_retry(&self.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; Ok(response) } From 61b9778596b22596a7097884ad2d604c73b8a45e Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Mon, 10 Jul 2023 15:39:58 +0200 Subject: [PATCH 160/397] object_store/InMemory: Add `fork()` fn and deprecate `clone()` fn (#4499) --- src/memory.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/memory.rs b/src/memory.rs index 82d4859..98b3a15 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -287,14 +287,18 @@ impl InMemory { Self::default() } - /// Creates a clone of the store - pub async fn clone(&self) -> Self { + /// Creates a fork of the store, with the current content copied into the + /// new store. 
+ pub fn fork(&self) -> Self { let storage = self.storage.read(); - let storage = storage.clone(); + let storage = Arc::new(RwLock::new(storage.clone())); + Self { storage } + } - Self { - storage: Arc::new(RwLock::new(storage)), - } + /// Creates a clone of the store + #[deprecated(note = "Use fork() instead")] + pub async fn clone(&self) -> Self { + self.fork() } async fn entry(&self, location: &Path) -> Result<(Bytes, DateTime)> { From 094c8fbec0839d52a0f608a8dbe2a7f77b6e8867 Mon Sep 17 00:00:00 2001 From: eitsupi <50911393+eitsupi@users.noreply.github.com> Date: Mon, 10 Jul 2023 23:03:48 +0900 Subject: [PATCH 161/397] ci: verify MSRV on CI (#4490) --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 5e2009d..255b972 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] repository = "https://github.com/apache/arrow-rs/tree/master/object_store" +rust-version = "1.62.1" [package.metadata.docs.rs] all-features = true From 8e233d6c9762c557e5f4404b90ebc9848788063d Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Mon, 10 Jul 2023 20:04:50 +0200 Subject: [PATCH 162/397] object_store: Implement `ObjectStore` for `Arc` (#4502) * object_store: Add `Box` tests * object_store: Extract `as_ref_impl!()` macro * object_store: Implement `ObjectStore` for `Arc` --- src/lib.rs | 179 ++++++++++++++++++++++++++++---------------------- src/memory.rs | 26 ++++++++ 2 files changed, 125 insertions(+), 80 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 864cabc..97e6aae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -270,6 +270,7 @@ use std::fmt::{Debug, Formatter}; #[cfg(not(target_arch = "wasm32"))] use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; +use std::sync::Arc; use tokio::io::AsyncWrite; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp", feature = "http"))] @@ -526,105 +527,123 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } } -#[async_trait] -impl ObjectStore for Box { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.as_ref().put(location, bytes).await - } +macro_rules! 
as_ref_impl { + ($type:ty) => { + #[async_trait] + impl ObjectStore for $type { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.as_ref().put(location, bytes).await + } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - self.as_ref().put_multipart(location).await - } + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + self.as_ref().put_multipart(location).await + } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { - self.as_ref().abort_multipart(location, multipart_id).await - } + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + self.as_ref().abort_multipart(location, multipart_id).await + } - async fn append( - &self, - location: &Path, - ) -> Result> { - self.as_ref().append(location).await - } + async fn append( + &self, + location: &Path, + ) -> Result> { + self.as_ref().append(location).await + } - async fn get(&self, location: &Path) -> Result { - self.as_ref().get(location).await - } + async fn get(&self, location: &Path) -> Result { + self.as_ref().get(location).await + } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - self.as_ref().get_opts(location, options).await - } + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> Result { + self.as_ref().get_opts(location, options).await + } - async fn get_range(&self, location: &Path, range: Range) -> Result { - self.as_ref().get_range(location, range).await - } + async fn get_range( + &self, + location: &Path, + range: Range, + ) -> Result { + self.as_ref().get_range(location, range).await + } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { - self.as_ref().get_ranges(location, ranges).await - } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + self.as_ref().get_ranges(location, ranges).await + } - async fn head(&self, location: &Path) -> Result { - self.as_ref().head(location).await - } + async fn head(&self, location: &Path) -> Result { + self.as_ref().head(location).await + } - async fn delete(&self, location: &Path) -> Result<()> { - self.as_ref().delete(location).await - } + async fn delete(&self, location: &Path) -> Result<()> { + self.as_ref().delete(location).await + } - fn delete_stream<'a>( - &'a self, - locations: BoxStream<'a, Result>, - ) -> BoxStream<'a, Result> { - self.as_ref().delete_stream(locations) - } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + self.as_ref().delete_stream(locations) + } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.as_ref().list(prefix).await - } + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + self.as_ref().list(prefix).await + } - async fn list_with_offset( - &self, - prefix: Option<&Path>, - offset: &Path, - ) -> Result>> { - self.as_ref().list_with_offset(prefix, offset).await - } + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + self.as_ref().list_with_offset(prefix, offset).await + } - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - self.as_ref().list_with_delimiter(prefix).await - } + async fn list_with_delimiter( + &self, + prefix: Option<&Path>, + ) -> Result { + self.as_ref().list_with_delimiter(prefix).await + } - async fn copy(&self, from: &Path, to: 
&Path) -> Result<()> { - self.as_ref().copy(from, to).await - } + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().copy(from, to).await + } - async fn rename(&self, from: &Path, to: &Path) -> Result<()> { - self.as_ref().rename(from, to).await - } + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().rename(from, to).await + } - async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - self.as_ref().copy_if_not_exists(from, to).await - } + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().copy_if_not_exists(from, to).await + } - async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - self.as_ref().rename_if_not_exists(from, to).await - } + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().rename_if_not_exists(from, to).await + } + } + }; } +as_ref_impl!(Arc); +as_ref_impl!(Box); + /// Result of a list call that includes objects, prefixes (directories) and a /// token for the next set of results. Individual result sets may be limited to /// 1,000 objects based on the underlying object storage's limitations. diff --git a/src/memory.rs b/src/memory.rs index 98b3a15..cfc2ac8 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -415,6 +415,32 @@ mod tests { stream_get(&integration).await; } + #[tokio::test] + async fn box_test() { + let integration: Box = Box::new(InMemory::new()); + + put_get_delete_list(&integration).await; + get_opts(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + + #[tokio::test] + async fn arc_test() { + let integration: Arc = Arc::new(InMemory::new()); + + put_get_delete_list(&integration).await; + get_opts(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + #[tokio::test] async fn unknown_length() { let integration = InMemory::new(); From 3c24d42b3e77f93936d534055a1fc852795fce3f Mon Sep 17 00:00:00 2001 From: Ahmad Sattar Date: Thu, 13 Jul 2023 18:58:41 +0200 Subject: [PATCH 163/397] object_store: Export `ClientConfigKey` and add `HttpBuilder::with_config` (#4516) * object_store: Export `ClientConfigKey` * object_store: Add `HttpBuilder::with_config` --- src/http/mod.rs | 10 ++++++++-- src/lib.rs | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/http/mod.rs b/src/http/mod.rs index 124b7da..bc01c17 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -43,8 +43,8 @@ use url::Url; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Result, RetryConfig, + ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, + ObjectMeta, ObjectStore, Result, RetryConfig, }; mod client; @@ -231,6 +231,12 @@ impl HttpBuilder { self } + /// Set individual client configuration without overriding the entire config + pub fn with_config(mut self, key: ClientConfigKey, value: impl Into) -> Self { + self.client_options = self.client_options.with_config(key, value); + self + } + /// Sets the client options, overriding any already set pub fn with_client_options(mut self, options: ClientOptions) -> 
Self { self.client_options = options; diff --git a/src/lib.rs b/src/lib.rs index 97e6aae..4867d48 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -274,7 +274,7 @@ use std::sync::Arc; use tokio::io::AsyncWrite; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp", feature = "http"))] -pub use client::ClientOptions; +pub use client::{ClientConfigKey, ClientOptions}; /// An alias for a dynamically dispatched object store implementation. pub type DynObjectStore = dyn ObjectStore; From c9fbc535f947ce3d722b2af7dcedde5fd2caf376 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Jul 2023 12:52:20 -0400 Subject: [PATCH 164/397] Handle empty S3 payloads (#4514) (#4518) --- src/aws/client.rs | 27 ++++++++++++++++----------- src/aws/mod.rs | 4 ++-- src/azure/client.rs | 2 +- src/lib.rs | 9 +++++++++ 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 0c24936..971d2c6 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -35,7 +35,10 @@ use bytes::{Buf, Bytes}; use itertools::Itertools; use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; -use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, Response}; +use reqwest::{ + header::{CONTENT_LENGTH, CONTENT_TYPE}, + Client as ReqwestClient, Method, Response, +}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::sync::Arc; @@ -236,7 +239,7 @@ impl S3Client { pub async fn put_request( &self, path: &Path, - bytes: Option, + bytes: Bytes, query: &T, ) -> Result { let credential = self.get_credential().await?; @@ -244,18 +247,20 @@ impl S3Client { let mut builder = self.client.request(Method::PUT, url); let mut payload_sha256 = None; - if let Some(bytes) = bytes { - if let Some(checksum) = self.config().checksum { - let digest = checksum.digest(&bytes); - builder = builder - .header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); - if checksum == Checksum::SHA256 { - payload_sha256 = Some(digest); - } + if let Some(checksum) = self.config().checksum { + let digest = checksum.digest(&bytes); + builder = + builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); + if checksum == Checksum::SHA256 { + payload_sha256 = Some(digest); } - builder = builder.body(bytes); } + builder = match bytes.is_empty() { + true => builder.header(CONTENT_LENGTH, 0), // Handle empty uploads (#4514) + false => builder.body(bytes), + }; + if let Some(value) = self.config().client_options.get_content_type(path) { builder = builder.header(CONTENT_TYPE, value); } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 8a486f9..e74e6f2 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -211,7 +211,7 @@ impl AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client.put_request(location, Some(bytes), &()).await?; + self.client.put_request(location, bytes, &()).await?; Ok(()) } @@ -321,7 +321,7 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload { .client .put_request( &self.location, - Some(buf.into()), + buf.into(), &[("partNumber", &part), ("uploadId", &self.upload_id)], ) .await?; diff --git a/src/azure/client.rs b/src/azure/client.rs index 5ed6f24..e18135c 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -387,7 +387,7 @@ fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result 0 && obj.location.as_ref().len() > prefix.as_ref().len() { 
diff --git a/src/lib.rs b/src/lib.rs index 4867d48..94261e7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1258,6 +1258,15 @@ mod tests { } delete_fixtures(storage).await; + + let path = Path::from("empty"); + storage.put(&path, Bytes::new()).await.unwrap(); + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.size, 0); + let data = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(data.len(), 0); + + storage.delete(&path).await.unwrap(); } pub(crate) async fn get_opts(storage: &dyn ObjectStore) { From c223796e12ca2c6b5f7f6a967e7771aa54bde0d4 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 26 Jul 2023 08:55:30 -0700 Subject: [PATCH 165/397] Make object_store::multipart public (#4570) * Make object_store::multipart public * one more public * docs * doc * more docs * derive debug * debug --- src/lib.rs | 4 ++-- src/multipart.rs | 30 +++++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 94261e7..082dca2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -250,8 +250,8 @@ pub use client::{backoff::BackoffConfig, retry::RetryConfig, CredentialProvider} #[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] mod config; -#[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] -mod multipart; +#[cfg(feature = "cloud")] +pub mod multipart; mod parse; mod util; diff --git a/src/multipart.rs b/src/multipart.rs index 2658030..5f9b7e6 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +//! Cloud Multipart Upload +//! +//! This crate provides an asynchronous interface for multipart file uploads to cloud storage services. +//! It's designed to offer efficient, non-blocking operations, +//! especially useful when dealing with large files or high-throughput systems. + use async_trait::async_trait; use futures::{stream::FuturesUnordered, Future, StreamExt}; use std::{io, pin::Pin, sync::Arc, task::Poll}; @@ -28,7 +34,7 @@ type BoxedTryFuture = Pin> + Sen /// and used in combination with [`CloudMultiPartUpload`] to provide /// multipart upload support #[async_trait] -pub(crate) trait CloudMultiPartUploadImpl: 'static { +pub trait CloudMultiPartUploadImpl: 'static { /// Upload a single part async fn put_multipart_part( &self, @@ -42,12 +48,15 @@ pub(crate) trait CloudMultiPartUploadImpl: 'static { async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error>; } +/// Represents a part of a file that has been successfully uploaded in a multipart upload process. #[derive(Debug, Clone)] -pub(crate) struct UploadPart { +pub struct UploadPart { + /// Id of this part pub content_id: String, } -pub(crate) struct CloudMultiPartUpload +/// Struct that manages and controls multipart uploads to a cloud storage service. 
+pub struct CloudMultiPartUpload where T: CloudMultiPartUploadImpl, { @@ -75,6 +84,7 @@ impl CloudMultiPartUpload where T: CloudMultiPartUploadImpl, { + /// Create a new multipart upload with the implementation and the given maximum concurrency pub fn new(inner: T, max_concurrency: usize) -> Self { Self { inner: Arc::new(inner), @@ -103,6 +113,7 @@ where to_copy } + /// Poll current tasks pub fn poll_tasks( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -257,3 +268,16 @@ where Pin::new(completion_task).poll(cx) } } + +impl std::fmt::Debug for CloudMultiPartUpload { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CloudMultiPartUpload") + .field("completed_parts", &self.completed_parts) + .field("tasks", &self.tasks) + .field("max_concurrency", &self.max_concurrency) + .field("current_buffer", &self.current_buffer) + .field("part_size", &self.part_size) + .field("current_part_idx", &self.current_part_idx) + .finish() + } +} From ce017f6bfd5fc80575a746f8924f7104c2f6f4b6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Jul 2023 02:32:07 -0400 Subject: [PATCH 166/397] Cleanup multipart upload trait (#4572) * Cleanup multipart upload trait * Update object_store/src/multipart.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- src/aws/client.rs | 4 +-- src/aws/mod.rs | 30 +++++------------- src/azure/mod.rs | 17 ++++------- src/gcp/mod.rs | 77 +++++++++++++++++++++++------------------------ src/multipart.rs | 50 ++++++++++-------------------- 5 files changed, 69 insertions(+), 109 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 971d2c6..1888976 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -23,7 +23,7 @@ use crate::client::list::ListClient; use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; -use crate::multipart::UploadPart; +use crate::multipart::PartId; use crate::path::DELIMITER; use crate::{ ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig, @@ -479,7 +479,7 @@ impl S3Client { &self, location: &Path, upload_id: &str, - parts: Vec, + parts: Vec, ) -> Result<()> { let parts = parts .into_iter() diff --git a/src/aws/mod.rs b/src/aws/mod.rs index e74e6f2..5a29bd0 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -56,7 +56,7 @@ use crate::client::{ TokenCredentialProvider, }; use crate::config::ConfigValue; -use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; +use crate::multipart::{PartId, PutPart, WriteMultiPart}; use crate::{ ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, RetryConfig, @@ -227,7 +227,7 @@ impl ObjectStore for AmazonS3 { client: Arc::clone(&self.client), }; - Ok((id, Box::new(CloudMultiPartUpload::new(upload, 8)))) + Ok((id, Box::new(WriteMultiPart::new(upload, 8)))) } async fn abort_multipart( @@ -308,12 +308,8 @@ struct S3MultiPartUpload { } #[async_trait] -impl CloudMultiPartUploadImpl for S3MultiPartUpload { - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result { +impl PutPart for S3MultiPartUpload { + async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { use reqwest::header::ETAG; let part = (part_idx + 1).to_string(); @@ -326,26 +322,16 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload { ) .await?; - let etag = response - .headers() - .get(ETAG) - 
.context(MissingEtagSnafu) - .map_err(crate::Error::from)?; + let etag = response.headers().get(ETAG).context(MissingEtagSnafu)?; - let etag = etag - .to_str() - .context(BadHeaderSnafu) - .map_err(crate::Error::from)?; + let etag = etag.to_str().context(BadHeaderSnafu)?; - Ok(UploadPart { + Ok(PartId { content_id: etag.to_string(), }) } - async fn complete( - &self, - completed_parts: Vec, - ) -> Result<(), std::io::Error> { + async fn complete(&self, completed_parts: Vec) -> Result<()> { self.client .complete_multipart(&self.location, &self.upload_id, completed_parts) .await?; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index d273503..8619319 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -28,7 +28,7 @@ //! after 7 days. use self::client::{BlockId, BlockList}; use crate::{ - multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, + multipart::{PartId, PutPart, WriteMultiPart}, path::Path, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, @@ -42,7 +42,6 @@ use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::fmt::{Debug, Formatter}; -use std::io; use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; @@ -186,7 +185,7 @@ impl ObjectStore for MicrosoftAzure { client: Arc::clone(&self.client), location: location.to_owned(), }; - Ok((String::new(), Box::new(CloudMultiPartUpload::new(inner, 8)))) + Ok((String::new(), Box::new(WriteMultiPart::new(inner, 8)))) } async fn abort_multipart( @@ -243,12 +242,8 @@ struct AzureMultiPartUpload { } #[async_trait] -impl CloudMultiPartUploadImpl for AzureMultiPartUpload { - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result { +impl PutPart for AzureMultiPartUpload { + async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { let content_id = format!("{part_idx:20}"); let block_id: BlockId = content_id.clone().into(); @@ -264,10 +259,10 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { ) .await?; - Ok(UploadPart { content_id }) + Ok(PartId { content_id }) } - async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error> { + async fn complete(&self, completed_parts: Vec) -> Result<()> { let blocks = completed_parts .into_iter() .map(|part| BlockId::from(part.content_id)) diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index d4d3703..d98e6b0 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -29,7 +29,6 @@ //! to abort the upload and drop those unneeded parts. In addition, you may wish to //! consider implementing automatic clean up of unused parts that are older than one //! week. 
-use std::io; use std::str::FromStr; use std::sync::Arc; @@ -52,7 +51,7 @@ use crate::client::{ TokenCredentialProvider, }; use crate::{ - multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, + multipart::{PartId, PutPart, WriteMultiPart}, path::{Path, DELIMITER}, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, @@ -117,6 +116,15 @@ enum Error { #[snafu(display("Error getting put response body: {}", source))] PutResponseBody { source: reqwest::Error }, + #[snafu(display("Got invalid put response: {}", source))] + InvalidPutResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Error performing post request {}: {}", path, source))] + PostRequest { + source: crate::client::retry::Error, + path: String, + }, + #[snafu(display("Error decoding object size: {}", source))] InvalidSize { source: std::num::ParseIntError }, @@ -148,6 +156,12 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("ETag Header missing from response"))] + MissingEtag, + + #[snafu(display("Received header containing non-ASCII data"))] + BadHeader { source: header::ToStrError }, } impl From for super::Error { @@ -283,14 +297,9 @@ impl GoogleCloudStorageClient { })?; let data = response.bytes().await.context(PutResponseBodySnafu)?; - let result: InitiateMultipartUploadResult = quick_xml::de::from_reader( - data.as_ref().reader(), - ) - .context(InvalidXMLResponseSnafu { - method: "POST".to_string(), - url, - data, - })?; + let result: InitiateMultipartUploadResult = + quick_xml::de::from_reader(data.as_ref().reader()) + .context(InvalidPutResponseSnafu)?; Ok(result.upload_id) } @@ -472,24 +481,16 @@ struct GCSMultipartUpload { } #[async_trait] -impl CloudMultiPartUploadImpl for GCSMultipartUpload { +impl PutPart for GCSMultipartUpload { /// Upload an object part - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result { + async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { let upload_id = self.multipart_id.clone(); let url = format!( "{}/{}/{}", self.client.base_url, self.client.bucket_name_encoded, self.encoded_path ); - let credential = self - .client - .get_credential() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + let credential = self.client.get_credential().await?; let response = self .client @@ -504,26 +505,24 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .header(header::CONTENT_LENGTH, format!("{}", buf.len())) .body(buf) .send_retry(&self.client.retry_config) - .await?; + .await + .context(PutRequestSnafu { + path: &self.encoded_path, + })?; let content_id = response .headers() .get("ETag") - .ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "response headers missing ETag", - ) - })? + .context(MissingEtagSnafu)? .to_str() - .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))? + .context(BadHeaderSnafu)? 
.to_string(); - Ok(UploadPart { content_id }) + Ok(PartId { content_id }) } /// Complete a multipart upload - async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error> { + async fn complete(&self, completed_parts: Vec) -> Result<()> { let upload_id = self.multipart_id.clone(); let url = format!( "{}/{}/{}", @@ -539,16 +538,11 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { }) .collect(); - let credential = self - .client - .get_credential() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - + let credential = self.client.get_credential().await?; let upload_info = CompleteMultipartUpload { parts }; let data = quick_xml::se::to_string(&upload_info) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? + .context(InvalidPutResponseSnafu)? // We cannot disable the escaping that transforms "/" to ""e;" :( // https://github.com/tafia/quick-xml/issues/362 // https://github.com/tafia/quick-xml/issues/350 @@ -561,7 +555,10 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .query(&[("uploadId", upload_id)]) .body(data) .send_retry(&self.client.retry_config) - .await?; + .await + .context(PostRequestSnafu { + path: &self.encoded_path, + })?; Ok(()) } @@ -588,7 +585,7 @@ impl ObjectStore for GoogleCloudStorage { multipart_id: upload_id.clone(), }; - Ok((upload_id, Box::new(CloudMultiPartUpload::new(inner, 8)))) + Ok((upload_id, Box::new(WriteMultiPart::new(inner, 8)))) } async fn abort_multipart( diff --git a/src/multipart.rs b/src/multipart.rs index 5f9b7e6..d4c911f 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -31,40 +31,33 @@ use crate::Result; type BoxedTryFuture = Pin> + Send>>; /// A trait that can be implemented by cloud-based object stores -/// and used in combination with [`CloudMultiPartUpload`] to provide +/// and used in combination with [`WriteMultiPart`] to provide /// multipart upload support #[async_trait] -pub trait CloudMultiPartUploadImpl: 'static { +pub trait PutPart: Send + Sync + 'static { /// Upload a single part - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result; + async fn put_part(&self, buf: Vec, part_idx: usize) -> Result; /// Complete the upload with the provided parts /// /// `completed_parts` is in order of part number - async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error>; + async fn complete(&self, completed_parts: Vec) -> Result<()>; } /// Represents a part of a file that has been successfully uploaded in a multipart upload process. #[derive(Debug, Clone)] -pub struct UploadPart { +pub struct PartId { /// Id of this part pub content_id: String, } -/// Struct that manages and controls multipart uploads to a cloud storage service. -pub struct CloudMultiPartUpload -where - T: CloudMultiPartUploadImpl, -{ +/// Wrapper around a [`PutPart`] that implements [`AsyncWrite`] +pub struct WriteMultiPart { inner: Arc, /// A list of completed parts, in sequential order. - completed_parts: Vec>, + completed_parts: Vec>, /// Part upload tasks currently running - tasks: FuturesUnordered>, + tasks: FuturesUnordered>, /// Maximum number of upload tasks to run concurrently max_concurrency: usize, /// Buffer that will be sent in next upload. 
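// The trait surface above is deliberately small: a backend implements `PutPart`
// for its upload type and wraps it in `WriteMultiPart` to obtain the
// `AsyncWrite` handed out by `put_multipart`. A compile-level sketch with an
// invented in-memory backend (`BufferUpload` is not part of this patch); the
// S3, Azure and GCS implementations earlier in this patch follow the same shape.
use async_trait::async_trait;
use std::sync::Mutex;

use crate::multipart::{PartId, PutPart, WriteMultiPart};
use crate::Result;

/// Collects uploaded parts in memory; a stand-in for a real cloud client
#[derive(Default)]
struct BufferUpload {
    parts: Mutex<Vec<(usize, Vec<u8>)>>,
}

#[async_trait]
impl PutPart for BufferUpload {
    async fn put_part(&self, buf: Vec<u8>, part_idx: usize) -> Result<PartId> {
        self.parts.lock().unwrap().push((part_idx, buf));
        // A real store returns the backend's identifier here (ETag, block id, ...)
        Ok(PartId {
            content_id: part_idx.to_string(),
        })
    }

    async fn complete(&self, _completed_parts: Vec<PartId>) -> Result<()> {
        // `completed_parts` arrives ordered by part number; a real store issues
        // its "complete multipart upload" request here
        Ok(())
    }
}

// A backend's `put_multipart` can then return the writer, capping concurrent
// part uploads at 8 as the implementations above do:
// Ok((upload_id, Box::new(WriteMultiPart::new(BufferUpload::default(), 8))))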
@@ -80,10 +73,7 @@ where completion_task: Option>, } -impl CloudMultiPartUpload -where - T: CloudMultiPartUploadImpl, -{ +impl WriteMultiPart { /// Create a new multipart upload with the implementation and the given maximum concurrency pub fn new(inner: T, max_concurrency: usize) -> Self { Self { @@ -114,7 +104,7 @@ where } /// Poll current tasks - pub fn poll_tasks( + fn poll_tasks( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, ) -> Result<(), io::Error> { @@ -130,12 +120,7 @@ where } Ok(()) } -} -impl CloudMultiPartUpload -where - T: CloudMultiPartUploadImpl + Send + Sync, -{ // The `poll_flush` function will only flush the in-progress tasks. // The `final_flush` method called during `poll_shutdown` will flush // the `current_buffer` along with in-progress tasks. @@ -153,7 +138,7 @@ where let inner = Arc::clone(&self.inner); let part_idx = self.current_part_idx; self.tasks.push(Box::pin(async move { - let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; + let upload_part = inner.put_part(out_buffer, part_idx).await?; Ok((part_idx, upload_part)) })); } @@ -169,10 +154,7 @@ where } } -impl AsyncWrite for CloudMultiPartUpload -where - T: CloudMultiPartUploadImpl + Send + Sync, -{ +impl AsyncWrite for WriteMultiPart { fn poll_write( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -199,7 +181,7 @@ where let inner = Arc::clone(&self.inner); let part_idx = self.current_part_idx; self.tasks.push(Box::pin(async move { - let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; + let upload_part = inner.put_part(out_buffer, part_idx).await?; Ok((part_idx, upload_part)) })); self.current_part_idx += 1; @@ -269,9 +251,9 @@ where } } -impl std::fmt::Debug for CloudMultiPartUpload { +impl std::fmt::Debug for WriteMultiPart { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CloudMultiPartUpload") + f.debug_struct("WriteMultiPart") .field("completed_parts", &self.completed_parts) .field("tasks", &self.tasks) .field("max_concurrency", &self.max_concurrency) From 850daa38c75a214cbc35d1e93bdbe1dfae261e1c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 2 Aug 2023 15:58:12 +0100 Subject: [PATCH 167/397] Use Config System for Object Store Integration Tests (#4628) --- Cargo.toml | 1 - src/aws/mod.rs | 171 +++++------------------------------------------ src/azure/mod.rs | 101 ++-------------------------- src/gcp/mod.rs | 78 +++++---------------- src/http/mod.rs | 7 +- src/lib.rs | 10 +++ 6 files changed, 48 insertions(+), 320 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 255b972..eca5a5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,7 +71,6 @@ aws = ["cloud"] http = ["cloud"] [dev-dependencies] # In alphabetical order -dotenv = "0.15.0" tempfile = "3.1.0" futures-test = "0.3" rand = "0.8" diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 5a29bd0..f6066d4 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -1067,153 +1067,9 @@ mod tests { }; use bytes::Bytes; use std::collections::HashMap; - use std::env; const NON_EXISTENT_NAME: &str = "nonexistentname"; - // Helper macro to skip tests if TEST_INTEGRATION and the AWS - // environment variables are not set. Returns a configured - // AmazonS3Builder - macro_rules! 
maybe_skip_integration { - () => {{ - dotenv::dotenv().ok(); - - let required_vars = [ - "OBJECT_STORE_AWS_DEFAULT_REGION", - "OBJECT_STORE_BUCKET", - "OBJECT_STORE_AWS_ACCESS_KEY_ID", - "OBJECT_STORE_AWS_SECRET_ACCESS_KEY", - ]; - let unset_vars: Vec<_> = required_vars - .iter() - .filter_map(|&name| match env::var(name) { - Ok(_) => None, - Err(_) => Some(name), - }) - .collect(); - let unset_var_names = unset_vars.join(", "); - - let force = env::var("TEST_INTEGRATION"); - - if force.is_ok() && !unset_var_names.is_empty() { - panic!( - "TEST_INTEGRATION is set, \ - but variable(s) {} need to be set", - unset_var_names - ); - } else if force.is_err() { - eprintln!( - "skipping AWS integration test - set {}TEST_INTEGRATION to run", - if unset_var_names.is_empty() { - String::new() - } else { - format!("{} and ", unset_var_names) - } - ); - return; - } else { - let config = AmazonS3Builder::new() - .with_access_key_id( - env::var("OBJECT_STORE_AWS_ACCESS_KEY_ID") - .expect("already checked OBJECT_STORE_AWS_ACCESS_KEY_ID"), - ) - .with_secret_access_key( - env::var("OBJECT_STORE_AWS_SECRET_ACCESS_KEY") - .expect("already checked OBJECT_STORE_AWS_SECRET_ACCESS_KEY"), - ) - .with_region( - env::var("OBJECT_STORE_AWS_DEFAULT_REGION") - .expect("already checked OBJECT_STORE_AWS_DEFAULT_REGION"), - ) - .with_bucket_name( - env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - ) - .with_allow_http(true); - - let config = if let Ok(endpoint) = env::var("OBJECT_STORE_AWS_ENDPOINT") { - config.with_endpoint(endpoint) - } else { - config - }; - - let config = if let Ok(token) = env::var("OBJECT_STORE_AWS_SESSION_TOKEN") - { - config.with_token(token) - } else { - config - }; - - let config = if let Ok(virtual_hosted_style_request) = - env::var("OBJECT_STORE_VIRTUAL_HOSTED_STYLE_REQUEST") - { - config.with_virtual_hosted_style_request( - virtual_hosted_style_request.trim().parse().unwrap(), - ) - } else { - config - }; - - config - } - }}; - } - - #[test] - fn s3_test_config_from_env() { - let aws_access_key_id = env::var("AWS_ACCESS_KEY_ID") - .unwrap_or_else(|_| "object_store:fake_access_key_id".into()); - let aws_secret_access_key = env::var("AWS_SECRET_ACCESS_KEY") - .unwrap_or_else(|_| "object_store:fake_secret_key".into()); - - let aws_default_region = env::var("AWS_DEFAULT_REGION") - .unwrap_or_else(|_| "object_store:fake_default_region".into()); - - let aws_endpoint = env::var("AWS_ENDPOINT") - .unwrap_or_else(|_| "object_store:fake_endpoint".into()); - let aws_session_token = env::var("AWS_SESSION_TOKEN") - .unwrap_or_else(|_| "object_store:fake_session_token".into()); - - let container_creds_relative_uri = - env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") - .unwrap_or_else(|_| "/object_store/fake_credentials_uri".into()); - - // required - env::set_var("AWS_ACCESS_KEY_ID", &aws_access_key_id); - env::set_var("AWS_SECRET_ACCESS_KEY", &aws_secret_access_key); - env::set_var("AWS_DEFAULT_REGION", &aws_default_region); - - // optional - env::set_var("AWS_ENDPOINT", &aws_endpoint); - env::set_var("AWS_SESSION_TOKEN", &aws_session_token); - env::set_var( - "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI", - &container_creds_relative_uri, - ); - env::set_var("AWS_UNSIGNED_PAYLOAD", "true"); - env::set_var("AWS_CHECKSUM_ALGORITHM", "sha256"); - - let builder = AmazonS3Builder::from_env(); - assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); - assert_eq!( - builder.secret_access_key.unwrap(), - aws_secret_access_key.as_str() - ); - 
assert_eq!(builder.region.unwrap(), aws_default_region); - - assert_eq!(builder.endpoint.unwrap(), aws_endpoint); - assert_eq!(builder.token.unwrap(), aws_session_token); - assert_eq!( - builder.container_credentials_relative_uri.unwrap(), - container_creds_relative_uri - ); - assert_eq!( - builder.checksum_algorithm.unwrap().get().unwrap(), - Checksum::SHA256 - ); - assert!(builder.unsigned_payload.get().unwrap()); - } - #[test] fn s3_test_config_from_map() { let aws_access_key_id = "object_store:fake_access_key_id".to_string(); @@ -1304,7 +1160,9 @@ mod tests { #[tokio::test] async fn s3_test() { - let config = maybe_skip_integration!(); + crate::test_util::maybe_skip_integration!(); + let config = AmazonS3Builder::from_env(); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); @@ -1317,13 +1175,14 @@ mod tests { stream_get(&integration).await; // run integration test with unsigned payload enabled - let config = maybe_skip_integration!().with_unsigned_payload(true); + let config = AmazonS3Builder::from_env().with_unsigned_payload(true); let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; // run integration test with checksum set to sha256 - let config = maybe_skip_integration!().with_checksum_algorithm(Checksum::SHA256); + let config = + AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; @@ -1331,8 +1190,8 @@ mod tests { #[tokio::test] async fn s3_test_get_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = AmazonS3Builder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1344,7 +1203,8 @@ mod tests { #[tokio::test] async fn s3_test_get_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + crate::test_util::maybe_skip_integration!(); + let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1355,8 +1215,8 @@ mod tests { #[tokio::test] async fn s3_test_put_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); - + crate::test_util::maybe_skip_integration!(); + let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1368,8 +1228,8 @@ mod tests { #[tokio::test] async fn s3_test_delete_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = AmazonS3Builder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1378,7 +1238,8 @@ mod tests { #[tokio::test] async fn s3_test_delete_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + crate::test_util::maybe_skip_integration!(); + let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); let 
location = Path::from_iter([NON_EXISTENT_NAME]); diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 8619319..019cde5 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -1021,107 +1021,18 @@ mod tests { use super::*; use crate::tests::{ copy_if_not_exists, get_opts, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, put_get_delete_list_opts, - rename_and_copy, stream_get, + list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, }; use std::collections::HashMap; - use std::env; - - // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment - // variables are not set. - macro_rules! maybe_skip_integration { - () => {{ - dotenv::dotenv().ok(); - - let use_emulator = std::env::var("AZURE_USE_EMULATOR").is_ok(); - - let mut required_vars = vec!["OBJECT_STORE_BUCKET"]; - if !use_emulator { - required_vars.push("AZURE_STORAGE_ACCOUNT"); - required_vars.push("AZURE_STORAGE_ACCESS_KEY"); - } - let unset_vars: Vec<_> = required_vars - .iter() - .filter_map(|&name| match env::var(name) { - Ok(_) => None, - Err(_) => Some(name), - }) - .collect(); - let unset_var_names = unset_vars.join(", "); - - let force = std::env::var("TEST_INTEGRATION"); - - if force.is_ok() && !unset_var_names.is_empty() { - panic!( - "TEST_INTEGRATION is set, \ - but variable(s) {} need to be set", - unset_var_names - ) - } else if force.is_err() { - eprintln!( - "skipping Azure integration test - set {}TEST_INTEGRATION to run", - if unset_var_names.is_empty() { - String::new() - } else { - format!("{} and ", unset_var_names) - } - ); - return; - } else { - let builder = MicrosoftAzureBuilder::new() - .with_container_name( - env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - ) - .with_use_emulator(use_emulator); - if !use_emulator { - builder - .with_account( - env::var("AZURE_STORAGE_ACCOUNT").unwrap_or_default(), - ) - .with_access_key( - env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), - ) - } else { - builder - } - } - }}; - } #[tokio::test] async fn azure_blob_test() { - let integration = maybe_skip_integration!().build().unwrap(); - put_get_delete_list_opts(&integration, false).await; - get_opts(&integration).await; - list_uses_directories_correctly(&integration).await; - list_with_delimiter(&integration).await; - rename_and_copy(&integration).await; - copy_if_not_exists(&integration).await; - stream_get(&integration).await; - } + crate::test_util::maybe_skip_integration!(); + let container_name = std::env::var("AZURE_CONTAINER_NAME").unwrap(); // (#4629) + let config = MicrosoftAzureBuilder::from_env(); + let integration = config.with_container_name(container_name).build().unwrap(); - // test for running integration test against actual blob service with service principal - // credentials. 
To run make sure all environment variables are set and remove the ignore - #[tokio::test] - #[ignore] - async fn azure_blob_test_sp() { - dotenv::dotenv().ok(); - let builder = MicrosoftAzureBuilder::new() - .with_account( - env::var("AZURE_STORAGE_ACCOUNT") - .expect("must be set AZURE_STORAGE_ACCOUNT"), - ) - .with_container_name( - env::var("OBJECT_STORE_BUCKET").expect("must be set OBJECT_STORE_BUCKET"), - ) - .with_access_key( - env::var("AZURE_STORAGE_ACCESS_KEY") - .expect("must be set AZURE_STORAGE_CLIENT_ID"), - ); - let integration = builder.build().unwrap(); - - put_get_delete_list(&integration).await; + put_get_delete_list_opts(&integration, false).await; get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index d98e6b0..58a5d19 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -1090,7 +1090,6 @@ impl GoogleCloudStorageBuilder { mod test { use bytes::Bytes; use std::collections::HashMap; - use std::env; use std::io::Write; use tempfile::NamedTempFile; @@ -1101,56 +1100,10 @@ mod test { const FAKE_KEY: &str = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; const NON_EXISTENT_NAME: &str = "nonexistentname"; - // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set. - macro_rules! maybe_skip_integration { - () => {{ - dotenv::dotenv().ok(); - - let required_vars = ["OBJECT_STORE_BUCKET", "GOOGLE_SERVICE_ACCOUNT"]; - let unset_vars: Vec<_> = required_vars - .iter() - .filter_map(|&name| match env::var(name) { - Ok(_) => None, - Err(_) => Some(name), - }) - .collect(); - let unset_var_names = unset_vars.join(", "); - - let force = std::env::var("TEST_INTEGRATION"); - - if force.is_ok() && !unset_var_names.is_empty() { - panic!( - "TEST_INTEGRATION is set, \ - but variable(s) {} need to be set", - unset_var_names - ) - } else if force.is_err() { - eprintln!( - "skipping Google Cloud integration test - set {}TEST_INTEGRATION to run", - if unset_var_names.is_empty() { - String::new() - } else { - format!("{} and ", unset_var_names) - } - ); - return; - } else { - GoogleCloudStorageBuilder::new() - .with_bucket_name( - env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET") - ) - .with_service_account_path( - env::var("GOOGLE_SERVICE_ACCOUNT") - .expect("already checked GOOGLE_SERVICE_ACCOUNT") - ) - } - }}; - } - #[tokio::test] async fn gcs_test() { - let integration = maybe_skip_integration!().build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); put_get_delete_list(&integration).await; list_uses_directories_correctly(&integration).await; @@ -1170,7 +1123,8 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_location() { - let integration = maybe_skip_integration!().build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1184,10 +1138,9 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_bucket() { - let integration = maybe_skip_integration!() - .with_bucket_name(NON_EXISTENT_NAME) - .build() - .unwrap(); + crate::test_util::maybe_skip_integration!(); + let config = GoogleCloudStorageBuilder::from_env(); + let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); let location = 
Path::from_iter([NON_EXISTENT_NAME]); @@ -1203,7 +1156,8 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_location() { - let integration = maybe_skip_integration!().build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1216,10 +1170,9 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_bucket() { - let integration = maybe_skip_integration!() - .with_bucket_name(NON_EXISTENT_NAME) - .build() - .unwrap(); + crate::test_util::maybe_skip_integration!(); + let config = GoogleCloudStorageBuilder::from_env(); + let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1232,10 +1185,9 @@ mod test { #[tokio::test] async fn gcs_test_put_nonexistent_bucket() { - let integration = maybe_skip_integration!() - .with_bucket_name(NON_EXISTENT_NAME) - .build() - .unwrap(); + crate::test_util::maybe_skip_integration!(); + let config = GoogleCloudStorageBuilder::from_env(); + let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); let data = Bytes::from("arbitrary data"); diff --git a/src/http/mod.rs b/src/http/mod.rs index bc01c17..6927f1b 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -262,12 +262,7 @@ mod tests { #[tokio::test] async fn http_test() { - dotenv::dotenv().ok(); - let force = std::env::var("TEST_INTEGRATION"); - if force.is_err() { - eprintln!("skipping HTTP integration test - set TEST_INTEGRATION to run"); - return; - } + crate::test_util::maybe_skip_integration!(); let url = std::env::var("HTTP_URL").expect("HTTP_URL must be set"); let options = ClientOptions::new().with_allow_http(true); let integration = HttpBuilder::new() diff --git a/src/lib.rs b/src/lib.rs index 082dca2..6c70326 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -910,6 +910,16 @@ mod test_util { use super::*; use futures::TryStreamExt; + macro_rules! 
maybe_skip_integration { + () => { + if std::env::var("TEST_INTEGRATION").is_err() { + eprintln!("Skipping integration test - set TEST_INTEGRATION"); + return; + } + }; + } + pub(crate) use maybe_skip_integration; + pub async fn flatten_list_stream( storage: &DynObjectStore, prefix: Option<&Path>, From 74dce2d47e9b66e737c00bba7c74d38827c70f58 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 7 Aug 2023 15:44:40 +0100 Subject: [PATCH 168/397] Support copy_if_not_exists for Cloudflare R2 (#4190) (#4239) * Support copy_if_not_exists for Cloudflare R2 (#4190) * Add tests --- src/aws/client.rs | 48 +++++++++++++++++++++++++++++----- src/aws/copy.rs | 66 +++++++++++++++++++++++++++++++++++++++++++++++ src/aws/mod.rs | 44 ++++++++++++++++++++++++++----- 3 files changed, 144 insertions(+), 14 deletions(-) create mode 100644 src/aws/copy.rs diff --git a/src/aws/client.rs b/src/aws/client.rs index 1888976..1c35586 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -17,7 +17,9 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; -use crate::aws::{AwsCredentialProvider, STORE, STRICT_PATH_ENCODE_SET}; +use crate::aws::{ + AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, +}; use crate::client::get::GetClient; use crate::client::list::ListClient; use crate::client::list_response::ListResponse; @@ -37,7 +39,7 @@ use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; use reqwest::{ header::{CONTENT_LENGTH, CONTENT_TYPE}, - Client as ReqwestClient, Method, Response, + Client as ReqwestClient, Method, Response, StatusCode, }; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; @@ -206,6 +208,7 @@ pub struct S3Config { pub client_options: ClientOptions, pub sign_payload: bool, pub checksum: Option, + pub copy_if_not_exists: Option, } impl S3Config { @@ -424,14 +427,37 @@ impl S3Client { } /// Make an S3 Copy request - pub async fn copy_request(&self, from: &Path, to: &Path) -> Result<()> { + pub async fn copy_request( + &self, + from: &Path, + to: &Path, + overwrite: bool, + ) -> Result<()> { let credential = self.get_credential().await?; let url = self.config.path_url(to); let source = format!("{}/{}", self.config.bucket, encode_path(from)); - self.client + let mut builder = self + .client .request(Method::PUT, url) - .header("x-amz-copy-source", source) + .header("x-amz-copy-source", source); + + if !overwrite { + match &self.config.copy_if_not_exists { + Some(S3CopyIfNotExists::Header(k, v)) => { + builder = builder.header(k, v); + } + None => { + return Err(crate::Error::NotSupported { + source: "S3 does not support copy-if-not-exists" + .to_string() + .into(), + }) + } + } + } + + builder .with_aws_sigv4( credential.as_ref(), &self.config.region, @@ -441,8 +467,16 @@ impl S3Client { ) .send_retry(&self.config.retry_config) .await - .context(CopyRequestSnafu { - path: from.as_ref(), + .map_err(|source| match source.status() { + Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { + source: Box::new(source), + path: to.to_string(), + }, + _ => Error::CopyRequest { + source, + path: from.to_string(), + } + .into(), })?; Ok(()) diff --git a/src/aws/copy.rs b/src/aws/copy.rs new file mode 100644 index 0000000..6b96f99 --- /dev/null +++ b/src/aws/copy.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::config::Parse; + +/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for [`AmazonS3`] +#[derive(Debug, Clone)] +#[non_exhaustive] +pub enum S3CopyIfNotExists { + /// Some S3-compatible stores, such as Cloudflare R2, support copy if not exists + /// semantics through custom headers. + /// + /// If set, [`ObjectStore::copy_if_not_exists`] will perform a normal copy operation + /// with the provided header pair, and expect the store to fail with `412 Precondition Failed` + /// if the destination file already exists + /// + /// Encoded as `header::` ignoring whitespace + /// + /// For example `header: cf-copy-destination-if-none-match: *`, would set + /// the header `cf-copy-destination-if-none-match` to `*` + Header(String, String), +} + +impl std::fmt::Display for S3CopyIfNotExists { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Header(k, v) => write!(f, "header: {}: {}", k, v), + } + } +} + +impl S3CopyIfNotExists { + fn from_str(s: &str) -> Option { + let (variant, value) = s.split_once(':')?; + match variant.trim() { + "header" => { + let (k, v) = value.split_once(':')?; + Some(Self::Header(k.trim().to_string(), v.trim().to_string())) + } + _ => None, + } + } +} + +impl Parse for S3CopyIfNotExists { + fn parse(v: &str) -> crate::Result { + Self::from_str(v).ok_or_else(|| crate::Error::Generic { + store: "Config", + source: format!("Failed to parse \"{v}\" as S3CopyIfNotExists").into(), + }) + } +} diff --git a/src/aws/mod.rs b/src/aws/mod.rs index f6066d4..7e16b5a 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -44,7 +44,6 @@ use tokio::io::AsyncWrite; use tracing::info; use url::Url; -pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, @@ -64,8 +63,12 @@ use crate::{ mod checksum; mod client; +mod copy; mod credential; +pub use checksum::Checksum; +pub use copy::S3CopyIfNotExists; + // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html // // Do not URI-encode any of the unreserved characters that RFC 3986 defines: @@ -292,12 +295,11 @@ impl ObjectStore for AmazonS3 { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - self.client.copy_request(from, to).await + self.client.copy_request(from, to, true).await } - async fn copy_if_not_exists(&self, _source: &Path, _dest: &Path) -> Result<()> { - // Will need dynamodb_lock - Err(crate::Error::NotImplemented) + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy_request(from, to, false).await } } @@ -390,6 +392,8 @@ pub struct AmazonS3Builder { client_options: ClientOptions, /// Credentials credentials: Option, + /// Copy if not exists + copy_if_not_exists: 
Option>, } /// Configuration keys for [`AmazonS3Builder`] @@ -521,6 +525,11 @@ pub enum AmazonS3ConfigKey { /// ContainerCredentialsRelativeUri, + /// Configure how to provide [`ObjectStore::copy_if_not_exists`] + /// + /// See [`S3CopyIfNotExists`] + CopyIfNotExists, + /// Client options Client(ClientConfigKey), } @@ -543,6 +552,7 @@ impl AsRef for AmazonS3ConfigKey { Self::ContainerCredentialsRelativeUri => { "aws_container_credentials_relative_uri" } + Self::CopyIfNotExists => "copy_if_not_exists", Self::Client(opt) => opt.as_ref(), } } @@ -576,6 +586,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_container_credentials_relative_uri" => { Ok(Self::ContainerCredentialsRelativeUri) } + "copy_if_not_exists" => Ok(Self::CopyIfNotExists), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -686,6 +697,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } + AmazonS3ConfigKey::CopyIfNotExists => { + self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) + } }; self } @@ -753,6 +767,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { self.container_credentials_relative_uri.clone() } + AmazonS3ConfigKey::CopyIfNotExists => { + self.copy_if_not_exists.as_ref().map(ToString::to_string) + } } } @@ -935,6 +952,12 @@ impl AmazonS3Builder { self } + /// Configure how to provide [`ObjectStore::copy_if_not_exists`] + pub fn with_copy_if_not_exists(mut self, config: S3CopyIfNotExists) -> Self { + self.copy_if_not_exists = Some(config.into()); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(mut self) -> Result { @@ -945,6 +968,7 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; + let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; let credentials = if let Some(credentials) = self.credentials { credentials @@ -1050,6 +1074,7 @@ impl AmazonS3Builder { client_options: self.client_options, sign_payload: !self.unsigned_payload.get()?, checksum, + copy_if_not_exists, }; let client = Arc::new(S3Client::new(config)?); @@ -1062,8 +1087,9 @@ impl AmazonS3Builder { mod tests { use super::*; use crate::tests::{ - get_nonexistent_object, get_opts, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, + copy_if_not_exists, get_nonexistent_object, get_opts, + list_uses_directories_correctly, list_with_delimiter, put_get_delete_list_opts, + rename_and_copy, stream_get, }; use bytes::Bytes; use std::collections::HashMap; @@ -1164,6 +1190,7 @@ mod tests { let config = AmazonS3Builder::from_env(); let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); + let test_not_exists = config.copy_if_not_exists.is_some(); let integration = config.build().unwrap(); // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 @@ -1173,6 +1200,9 @@ mod tests { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; stream_get(&integration).await; + if test_not_exists { + copy_if_not_exists(&integration).await; + } // run integration test with unsigned payload enabled let config = AmazonS3Builder::from_env().with_unsigned_payload(true); From 
f15012ca6ac504755a9af8ada800026fe3ccdcf4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Aug 2023 08:03:34 -0400 Subject: [PATCH 169/397] Minor: improve object_store docs.rs library landing page (#4682) --- src/lib.rs | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6c70326..bb4ba5c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,10 +28,32 @@ //! # object_store //! -//! This crate provides a uniform API for interacting with object storage services and -//! local files via the the [`ObjectStore`] trait. +//! This crate provides a uniform API for interacting with object +//! storage services and local files via the [`ObjectStore`] +//! trait. //! -//! # Create an [`ObjectStore`] implementation: +//! Using this crate, the same binary and code can run in multiple +//! clouds and local test environments, via a simple runtime +//! configuration change. +//! +//! # Features: +//! +//! 1. A focused, easy to use, idiomatic, well documented, high +//! performance, `async` API. +//! +//! 2. Production quality, leading this crate to be used in large +//! scale production systems, such as [crates.io] and [InfluxDB IOx]. +//! +//! 3. Stable and predictable governance via the [Apache Arrow] project. +//! +//! Originally developed for [InfluxDB IOx] and subsequently donated +//! to [Apache Arrow]. +//! +//! [Apache Arrow]: https://arrow.apache.org/ +//! [InfluxDB IOx]: https://github.com/influxdata/influxdb_iox/ +//! [crates.io]: https://github.com/rust-lang/crates.io +//! +//! # Example: Create an [`ObjectStore`] implementation: //! #![cfg_attr( feature = "gcp", From f8cc0260b87962efcd4bf8f4deba53ff3970bed2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Aug 2023 13:35:10 +0100 Subject: [PATCH 170/397] Check object_store format in CI (#4679) * Check object_store format in CI * Format --- src/local.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/local.rs b/src/local.rs index ffff6a5..a0933cc 100644 --- a/src/local.rs +++ b/src/local.rs @@ -863,7 +863,11 @@ impl AsyncWrite for LocalUpload { } } -pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { +pub(crate) fn read_range( + file: &mut File, + path: &PathBuf, + range: Range, +) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) .context(SeekSnafu { path })?; From 1e71e94b0e936a7d17bfad81cd2a20b419fb367f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Aug 2023 10:53:11 -0400 Subject: [PATCH 171/397] Fix object_store docs and Add CI job (#4684) * Add CI job for object_store_docs * fix job * fix again * fix * Fix doc links * Add comment about why a different workflow is needed * Fix AmazonS3 link --- src/aws/copy.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/aws/copy.rs b/src/aws/copy.rs index 6b96f99..da4e280 100644 --- a/src/aws/copy.rs +++ b/src/aws/copy.rs @@ -17,7 +17,11 @@ use crate::config::Parse; -/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for [`AmazonS3`] +/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for +/// [`AmazonS3`]. 
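// Taken together, the changes above let callers opt in to `copy_if_not_exists`
// on S3-compatible stores that signal conflicts via a header, such as
// Cloudflare R2. A hedged sketch of wiring it up; the bucket name and paths are
// placeholders, and the header pair is the R2 example quoted in the docs above.
use object_store::aws::{AmazonS3Builder, S3CopyIfNotExists};
use object_store::{path::Path, ObjectStore};

async fn copy_unless_present() -> object_store::Result<()> {
    // Region, endpoint and credentials come from the environment, as in the tests
    let store = AmazonS3Builder::from_env()
        .with_bucket_name("my-bucket")
        .with_copy_if_not_exists(S3CopyIfNotExists::Header(
            "cf-copy-destination-if-none-match".to_string(),
            "*".to_string(),
        ))
        .build()?;

    // Fails with `Error::AlreadyExists` if the destination already exists,
    // mapped from the store's `412 Precondition Failed` response
    store
        .copy_if_not_exists(&Path::from("src/file"), &Path::from("dst/file"))
        .await
}

// The same setting can also be supplied as the string
// `header: cf-copy-destination-if-none-match: *` through the
// `copy_if_not_exists` configuration key added above.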
+/// +/// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists +/// [`AmazonS3`]: super::AmazonS3 #[derive(Debug, Clone)] #[non_exhaustive] pub enum S3CopyIfNotExists { @@ -32,6 +36,8 @@ pub enum S3CopyIfNotExists { /// /// For example `header: cf-copy-destination-if-none-match: *`, would set /// the header `cf-copy-destination-if-none-match` to `*` + /// + /// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists Header(String, String), } From fd6887b3172989f9cfea9be7ff7bd05268cd39e3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Aug 2023 16:19:14 +0100 Subject: [PATCH 172/397] Faster stream_get test (#4685) --- src/lib.rs | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bb4ba5c..cf7e479 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -959,8 +959,7 @@ mod test_util { mod tests { use super::*; use crate::test_util::flatten_list_stream; - use bytes::{BufMut, BytesMut}; - use itertools::Itertools; + use rand::{thread_rng, Rng}; use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { @@ -1380,27 +1379,27 @@ mod tests { } } - fn get_random_bytes(len: usize) -> Bytes { - use rand::Rng; - let mut rng = rand::thread_rng(); - let mut bytes = BytesMut::with_capacity(len); - for _ in 0..len { - bytes.put_u8(rng.gen()); + /// Returns a chunk of length `chunk_length` + fn get_chunk(chunk_length: usize) -> Bytes { + let mut data = vec![0_u8; chunk_length]; + let mut rng = thread_rng(); + // Set a random selection of bytes + for _ in 0..1000 { + data[rng.gen_range(0..chunk_length)] = rng.gen(); } - bytes.freeze() + data.into() } - fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { - std::iter::repeat(get_random_bytes(chunk_length)) - .take(num_chunks) - .collect() + /// Returns `num_chunks` of length `chunks` + fn get_chunks(chunk_length: usize, num_chunks: usize) -> Vec { + (0..num_chunks).map(|_| get_chunk(chunk_length)).collect() } pub(crate) async fn stream_get(storage: &DynObjectStore) { let location = Path::from("test_dir/test_upload_file.txt"); // Can write to storage - let data = get_vec_of_bytes(5_000, 10); + let data = get_chunks(5_000, 10); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { @@ -1427,7 +1426,7 @@ mod tests { // Can overwrite some storage // Sizes chosen to ensure we write three parts - let data = (0..7).map(|_| get_random_bytes(3_200_000)).collect_vec(); + let data = get_chunks(3_200_000, 7); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { From ad1d9da5c32192f46fc8c0b957669fcfa6e2ce9e Mon Sep 17 00:00:00 2001 From: vmuddassir-msft <140655500+vmuddassir-msft@users.noreply.github.com> Date: Fri, 11 Aug 2023 20:50:16 +0530 Subject: [PATCH 173/397] Add Support for Microsoft Fabric / OneLake (#4573) * Changes required for onelake-fix * Fix Unit tests * Add Unit Tests * Add onelake read/write test * Add with_use_fabric , for fabric url check * Final tweaks * Further tweaks * Automatically set use_fabric_endpoint --------- Co-authored-by: Raphael Taylor-Davies --- src/azure/mod.rs | 101 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 019cde5..6bb4cda 100644 --- a/src/azure/mod.rs +++ 
b/src/azure/mod.rs @@ -341,6 +341,10 @@ pub struct MicrosoftAzureBuilder { client_options: ClientOptions, /// Credentials credentials: Option, + /// When set to true, fabric url scheme will be used + /// + /// i.e. https://{account_name}.dfs.fabric.microsoft.com + use_fabric_endpoint: ConfigValue, } /// Configuration keys for [`MicrosoftAzureBuilder`] @@ -430,6 +434,13 @@ pub enum AzureConfigKey { /// - `use_emulator` UseEmulator, + /// Use object store with url scheme account.dfs.fabric.microsoft.com + /// + /// Supported keys: + /// - `azure_use_fabric_endpoint` + /// - `use_fabric_endpoint` + UseFabricEndpoint, + /// Endpoint to request a imds managed identity token /// /// Supported keys: @@ -482,6 +493,7 @@ impl AsRef for AzureConfigKey { Self::SasKey => "azure_storage_sas_key", Self::Token => "azure_storage_token", Self::UseEmulator => "azure_storage_use_emulator", + Self::UseFabricEndpoint => "azure_use_fabric_endpoint", Self::MsiEndpoint => "azure_msi_endpoint", Self::ObjectId => "azure_object_id", Self::MsiResourceId => "azure_msi_resource_id", @@ -531,6 +543,9 @@ impl FromStr for AzureConfigKey { "azure_federated_token_file" | "federated_token_file" => { Ok(Self::FederatedTokenFile) } + "azure_use_fabric_endpoint" | "use_fabric_endpoint" => { + Ok(Self::UseFabricEndpoint) + } "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), // Backwards compatibility "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), @@ -600,11 +615,16 @@ impl MicrosoftAzureBuilder { /// /// - `abfs[s]:///` (according to [fsspec](https://github.com/fsspec/adlfs)) /// - `abfs[s]://@.dfs.core.windows.net/` + /// - `abfs[s]://@.dfs.fabric.microsoft.com/` /// - `az:///` (according to [fsspec](https://github.com/fsspec/adlfs)) /// - `adl:///` (according to [fsspec](https://github.com/fsspec/adlfs)) /// - `azure:///` (custom) /// - `https://.dfs.core.windows.net` /// - `https://.blob.core.windows.net` + /// - `https://.dfs.fabric.microsoft.com` + /// - `https://.dfs.fabric.microsoft.com/` + /// - `https://.blob.fabric.microsoft.com` + /// - `https://.blob.fabric.microsoft.com/` /// /// Note: Settings derived from the URL will override any others set on this builder /// @@ -639,6 +659,7 @@ impl MicrosoftAzureBuilder { } AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), AzureConfigKey::UseEmulator => self.use_emulator.parse(value), + AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), AzureConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } @@ -692,6 +713,9 @@ impl MicrosoftAzureBuilder { AzureConfigKey::SasKey => self.sas_key.clone(), AzureConfigKey::Token => self.bearer_token.clone(), AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), + AzureConfigKey::UseFabricEndpoint => { + Some(self.use_fabric_endpoint.to_string()) + } AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), AzureConfigKey::ObjectId => self.object_id.clone(), AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), @@ -724,6 +748,10 @@ impl MicrosoftAzureBuilder { } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") { self.container_name = Some(validate(parsed.username())?); self.account_name = Some(validate(a)?); + } else if let Some(a) = host.strip_suffix(".dfs.fabric.microsoft.com") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + self.use_fabric_endpoint = true.into(); } else { return Err(UrlNotRecognisedSnafu { url }.build().into()); } 
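// With the additions above, a builder can target Microsoft Fabric / OneLake
// either from a URL of the form `abfss://<workspace>@onelake.dfs.fabric.microsoft.com/`
// (exercised by the parse_url tests below) or explicitly via the new flag.
// A sketch of the explicit form; the account, workspace and key are
// placeholders, the "container" is the OneLake workspace name as noted in the
// parsing code above, and any of the other supported Azure credential options
// can be used in place of the account key.
use object_store::azure::{MicrosoftAzure, MicrosoftAzureBuilder};

fn onelake_store() -> object_store::Result<MicrosoftAzure> {
    MicrosoftAzureBuilder::new()
        .with_account("onelake")
        .with_container_name("my_workspace")
        .with_access_key("...")
        .with_use_fabric_endpoint(true)
        .build()
}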
@@ -733,6 +761,21 @@ impl MicrosoftAzureBuilder { | Some((a, "blob.core.windows.net")) => { self.account_name = Some(validate(a)?); } + Some((a, "dfs.fabric.microsoft.com")) + | Some((a, "blob.fabric.microsoft.com")) => { + self.account_name = Some(validate(a)?); + // Attempt to infer the container name from the URL + // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv + // - https://onelake.dfs.fabric.microsoft.com//.// + // + // See + if let Some(workspace) = parsed.path_segments().unwrap().next() { + if !workspace.is_empty() { + self.container_name = Some(workspace.to_string()) + } + } + self.use_fabric_endpoint = true.into(); + } _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), }, scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), @@ -819,6 +862,14 @@ impl MicrosoftAzureBuilder { self } + /// Set if Microsoft Fabric url scheme should be used (defaults to false) + /// When disabled the url scheme used is `https://{account}.blob.core.windows.net` + /// When enabled the url scheme used is `https://{account}.dfs.fabric.microsoft.com` + pub fn with_use_fabric_endpoint(mut self, use_fabric_endpoint: bool) -> Self { + self.use_fabric_endpoint = use_fabric_endpoint.into(); + self + } + /// Sets what protocol is allowed. If `allow_http` is : /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed @@ -885,6 +936,7 @@ impl MicrosoftAzureBuilder { } let container = self.container_name.ok_or(Error::MissingContainerName {})?; + let static_creds = |credential: AzureCredential| -> AzureCredentialProvider { Arc::new(StaticCredentialProvider::new(credential)) }; @@ -906,7 +958,11 @@ impl MicrosoftAzureBuilder { (true, url, credential, account_name) } else { let account_name = self.account_name.ok_or(Error::MissingAccount {})?; - let account_url = format!("https://{}.blob.core.windows.net", &account_name); + let account_url = match self.use_fabric_endpoint.get()? 
{ + true => format!("https://{}.blob.fabric.microsoft.com", &account_name), + false => format!("https://{}.blob.core.windows.net", &account_name), + }; + let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; @@ -1049,6 +1105,15 @@ mod tests { .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(builder.use_fabric_endpoint.get().unwrap()); let mut builder = MicrosoftAzureBuilder::new(); builder.parse_url("abfs://container/path").unwrap(); @@ -1067,12 +1132,46 @@ mod tests { .parse_url("https://account.dfs.core.windows.net/") .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); let mut builder = MicrosoftAzureBuilder::new(); builder .parse_url("https://account.blob.core.windows.net/") .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.fabric.microsoft.com/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("container")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.fabric.microsoft.com/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("container")); + assert!(builder.use_fabric_endpoint.get().unwrap()); let err_cases = [ "mailto://account.blob.core.windows.net/", From 02b5cff5eeafa8752eecd86288a3fcb1ca991ba4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:09:21 +0100 Subject: [PATCH 174/397] Add AzureConfigKey::ContainerName (#4629) (#4686) --- src/azure/mod.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 6bb4cda..27bbbfb 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -478,6 +478,13 @@ pub enum AzureConfigKey { /// - `use_azure_cli` UseAzureCli, + /// Container name + /// + /// Supported keys: + /// - `azure_container_name` + /// - `container_name` + ContainerName, + /// Client options Client(ClientConfigKey), } @@ -499,6 +506,7 @@ impl AsRef for AzureConfigKey { Self::MsiResourceId => "azure_msi_resource_id", Self::FederatedTokenFile => 
"azure_federated_token_file", Self::UseAzureCli => "azure_use_azure_cli", + Self::ContainerName => "azure_container_name", Self::Client(key) => key.as_ref(), } } @@ -547,6 +555,7 @@ impl FromStr for AzureConfigKey { Ok(Self::UseFabricEndpoint) } "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), + "azure_container_name" | "container_name" => Ok(Self::ContainerName), // Backwards compatibility "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -663,6 +672,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } + AzureConfigKey::ContainerName => self.container_name = Some(value.into()), }; self } @@ -722,6 +732,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), AzureConfigKey::Client(key) => self.client_options.get_config_value(key), + AzureConfigKey::ContainerName => self.container_name.clone(), } } @@ -1084,9 +1095,7 @@ mod tests { #[tokio::test] async fn azure_blob_test() { crate::test_util::maybe_skip_integration!(); - let container_name = std::env::var("AZURE_CONTAINER_NAME").unwrap(); // (#4629) - let config = MicrosoftAzureBuilder::from_env(); - let integration = config.with_container_name(container_name).build().unwrap(); + let integration = MicrosoftAzureBuilder::from_env().build().unwrap(); put_get_delete_list_opts(&integration, false).await; get_opts(&integration).await; From 7a46cb46872f007a9841d1d8c302488eba2ddb72 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 14 Aug 2023 12:11:43 +0100 Subject: [PATCH 175/397] Add range and ObjectMeta to GetResult (#4352) (#4495) (#4677) * Add range and ObjectMeta to GetResult (#4352) (#4495) * Review feedback * Fix docs --- src/chunked.rs | 126 +++++++++++++++++++--------------------------- src/client/get.rs | 15 +++++- src/http/mod.rs | 19 +++++-- src/lib.rs | 103 ++++++++++++++----------------------- src/limit.rs | 30 +++++------ src/local.rs | 80 +++++++++++++++++++++++------ src/memory.rs | 50 ++++++++++++------ src/throttle.rs | 27 ++++++---- 8 files changed, 252 insertions(+), 198 deletions(-) diff --git a/src/chunked.rs b/src/chunked.rs index c639d7e..008dec6 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -18,7 +18,6 @@ //! A [`ChunkedStore`] that can be used to test streaming behaviour use std::fmt::{Debug, Display, Formatter}; -use std::io::{BufReader, Read}; use std::ops::Range; use std::sync::Arc; @@ -29,8 +28,9 @@ use futures::StreamExt; use tokio::io::AsyncWrite; use crate::path::Path; -use crate::util::maybe_spawn_blocking; -use crate::{GetOptions, GetResult, ListResult, ObjectMeta, ObjectStore}; +use crate::{ + GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, +}; use crate::{MultipartId, Result}; /// Wraps a [`ObjectStore`] and makes its get response return chunks @@ -82,77 +82,57 @@ impl ObjectStore for ChunkedStore { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - match self.inner.get_opts(location, options).await? { - GetResult::File(std_file, ..) 
=> { - let reader = BufReader::new(std_file); - let chunk_size = self.chunk_size; - Ok(GetResult::Stream( - futures::stream::try_unfold(reader, move |mut reader| async move { - let (r, out, reader) = maybe_spawn_blocking(move || { - let mut out = Vec::with_capacity(chunk_size); - let r = (&mut reader) - .take(chunk_size as u64) - .read_to_end(&mut out) - .map_err(|err| crate::Error::Generic { - store: "ChunkedStore", - source: Box::new(err), - })?; - Ok((r, out, reader)) - }) - .await?; - - match r { - 0 => Ok(None), - _ => Ok(Some((out.into(), reader))), - } - }) - .boxed(), - )) + let r = self.inner.get_opts(location, options).await?; + let stream = match r.payload { + GetResultPayload::File(file, path) => { + crate::local::chunked_stream(file, path, r.range.clone(), self.chunk_size) } - GetResult::Stream(stream) => { + GetResultPayload::Stream(stream) => { let buffer = BytesMut::new(); - Ok(GetResult::Stream( - futures::stream::unfold( - (stream, buffer, false, self.chunk_size), - |(mut stream, mut buffer, mut exhausted, chunk_size)| async move { - // Keep accumulating bytes until we reach capacity as long as - // the stream can provide them: - if exhausted { - return None; - } - while buffer.len() < chunk_size { - match stream.next().await { - None => { - exhausted = true; - let slice = buffer.split_off(0).freeze(); - return Some(( - Ok(slice), - (stream, buffer, exhausted, chunk_size), - )); - } - Some(Ok(bytes)) => { - buffer.put(bytes); - } - Some(Err(e)) => { - return Some(( - Err(crate::Error::Generic { - store: "ChunkedStore", - source: Box::new(e), - }), - (stream, buffer, exhausted, chunk_size), - )) - } - }; - } - // Return the chunked values as the next value in the stream - let slice = buffer.split_to(chunk_size).freeze(); - Some((Ok(slice), (stream, buffer, exhausted, chunk_size))) - }, - ) - .boxed(), - )) + futures::stream::unfold( + (stream, buffer, false, self.chunk_size), + |(mut stream, mut buffer, mut exhausted, chunk_size)| async move { + // Keep accumulating bytes until we reach capacity as long as + // the stream can provide them: + if exhausted { + return None; + } + while buffer.len() < chunk_size { + match stream.next().await { + None => { + exhausted = true; + let slice = buffer.split_off(0).freeze(); + return Some(( + Ok(slice), + (stream, buffer, exhausted, chunk_size), + )); + } + Some(Ok(bytes)) => { + buffer.put(bytes); + } + Some(Err(e)) => { + return Some(( + Err(crate::Error::Generic { + store: "ChunkedStore", + source: Box::new(e), + }), + (stream, buffer, exhausted, chunk_size), + )) + } + }; + } + // Return the chunked values as the next value in the stream + let slice = buffer.split_to(chunk_size).freeze(); + Some((Ok(slice), (stream, buffer, exhausted, chunk_size))) + }, + ) + .boxed() } - } + }; + Ok(GetResult { + payload: GetResultPayload::Stream(stream), + ..r + }) } async fn get_range(&self, location: &Path, range: Range) -> Result { @@ -217,8 +197,8 @@ mod tests { for chunk_size in [10, 20, 31] { let store = ChunkedStore::new(Arc::clone(&store), chunk_size); - let mut s = match store.get(&location).await.unwrap() { - GetResult::Stream(s) => s, + let mut s = match store.get(&location).await.unwrap().payload { + GetResultPayload::Stream(s) => s, _ => unreachable!(), }; diff --git a/src/client/get.rs b/src/client/get.rs index 3c66a72..6b2d60a 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -17,8 +17,8 @@ use crate::client::header::header_meta; use crate::path::Path; -use crate::Result; use crate::{Error, GetOptions, GetResult, 
ObjectMeta}; +use crate::{GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; use reqwest::Response; @@ -47,7 +47,14 @@ pub trait GetClientExt { #[async_trait] impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let range = options.range.clone(); let response = self.get_request(location, options, false).await?; + let meta = + header_meta(location, response.headers()).map_err(|e| Error::Generic { + store: T::STORE, + source: Box::new(e), + })?; + let stream = response .bytes_stream() .map_err(|source| Error::Generic { @@ -56,7 +63,11 @@ impl GetClientExt for T { }) .boxed(); - Ok(GetResult::Stream(stream)) + Ok(GetResult { + range: range.unwrap_or(0..meta.size), + payload: GetResultPayload::Stream(stream), + meta, + }) } async fn head(&self, location: &Path) -> Result { diff --git a/src/http/mod.rs b/src/http/mod.rs index 6927f1b..e8e7b45 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -40,11 +40,12 @@ use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; +use crate::client::header::header_meta; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, - ObjectMeta, ObjectStore, Result, RetryConfig, + ClientConfigKey, ClientOptions, GetOptions, GetResult, GetResultPayload, ListResult, + MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, }; mod client; @@ -60,6 +61,11 @@ enum Error { url: String, }, + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, + #[snafu(display("Request error: {}", source))] Reqwest { source: reqwest::Error }, } @@ -109,13 +115,20 @@ impl ObjectStore for HttpStore { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let range = options.range.clone(); let response = self.client.get(location, options).await?; + let meta = header_meta(location, response.headers()).context(MetadataSnafu)?; + let stream = response .bytes_stream() .map_err(|source| Error::Reqwest { source }.into()) .boxed(); - Ok(GetResult::Stream(stream)) + Ok(GetResult { + payload: GetResultPayload::Stream(stream), + range: range.unwrap_or(0..meta.size), + meta, + }) } async fn head(&self, location: &Path) -> Result { diff --git a/src/lib.rs b/src/lib.rs index cf7e479..7496b58 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -374,8 +374,6 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } /// Perform a get request with options - /// - /// Note: options.range will be ignored if [`GetResult::File`] async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; /// Return the bytes that are stored at the specified location @@ -385,17 +383,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { range: Some(range.clone()), ..Default::default() }; - // Temporary until GetResult::File supports range (#4352) - match self.get_opts(location, options).await? 
{ - GetResult::Stream(s) => collect_bytes(s, None).await, - #[cfg(not(target_arch = "wasm32"))] - GetResult::File(mut file, path) => { - maybe_spawn_blocking(move || local::read_range(&mut file, &path, range)) - .await - } - #[cfg(target_arch = "wasm32")] - _ => unimplemented!("File IO not implemented on wasm32."), - } + self.get_opts(location, options).await?.bytes().await } /// Return the bytes that are stored at the specified location @@ -751,21 +739,32 @@ impl GetOptions { } /// Result for a get request +#[derive(Debug)] +pub struct GetResult { + /// The [`GetResultPayload`] + pub payload: GetResultPayload, + /// The [`ObjectMeta`] for this object + pub meta: ObjectMeta, + /// The range of bytes returned by this request + pub range: Range, +} + +/// The kind of a [`GetResult`] /// /// This special cases the case of a local file, as some systems may /// be able to optimise the case of a file already present on local disk -pub enum GetResult { - /// A file and its path on the local filesystem +pub enum GetResultPayload { + /// The file, path File(std::fs::File, std::path::PathBuf), - /// An asynchronous stream + /// An opaque stream of bytes Stream(BoxStream<'static, Result>), } -impl Debug for GetResult { +impl Debug for GetResultPayload { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::File(_, _) => write!(f, "GetResult(File)"), - Self::Stream(_) => write!(f, "GetResult(Stream)"), + Self::File(_, _) => write!(f, "GetResultPayload(File)"), + Self::Stream(_) => write!(f, "GetResultPayload(Stream)"), } } } @@ -773,32 +772,31 @@ impl Debug for GetResult { impl GetResult { /// Collects the data into a [`Bytes`] pub async fn bytes(self) -> Result { - match self { + let len = self.range.end - self.range.start; + match self.payload { #[cfg(not(target_arch = "wasm32"))] - Self::File(mut file, path) => { + GetResultPayload::File(mut file, path) => { maybe_spawn_blocking(move || { - let len = file.seek(SeekFrom::End(0)).map_err(|source| { - local::Error::Seek { + file.seek(SeekFrom::Start(self.range.start as _)).map_err( + |source| local::Error::Seek { source, path: path.clone(), - } - })?; - - file.rewind().map_err(|source| local::Error::Seek { - source, - path: path.clone(), - })?; + }, + )?; - let mut buffer = Vec::with_capacity(len as usize); - file.read_to_end(&mut buffer).map_err(|source| { - local::Error::UnableToReadBytes { source, path } - })?; + let mut buffer = Vec::with_capacity(len); + file.take(len as _) + .read_to_end(&mut buffer) + .map_err(|source| local::Error::UnableToReadBytes { + source, + path, + })?; Ok(buffer.into()) }) .await } - Self::Stream(s) => collect_bytes(s, None).await, + GetResultPayload::Stream(s) => collect_bytes(s, Some(len)).await, #[cfg(target_arch = "wasm32")] _ => unimplemented!("File IO not implemented on wasm32."), } @@ -806,8 +804,8 @@ impl GetResult { /// Converts this into a byte stream /// - /// If the result is [`Self::File`] will perform chunked reads of the file, otherwise - /// will return the [`Self::Stream`]. + /// If the `self.kind` is [`GetResultPayload::File`] will perform chunked reads of the file, + /// otherwise will return the [`GetResultPayload::Stream`]. 
/// /// # Tokio Compatibility /// @@ -819,36 +817,13 @@ impl GetResult { /// If not called from a tokio context, this will perform IO on the current thread with /// no additional complexity or overheads pub fn into_stream(self) -> BoxStream<'static, Result> { - match self { + match self.payload { #[cfg(not(target_arch = "wasm32"))] - Self::File(file, path) => { + GetResultPayload::File(file, path) => { const CHUNK_SIZE: usize = 8 * 1024; - - futures::stream::try_unfold( - (file, path, false), - |(mut file, path, finished)| { - maybe_spawn_blocking(move || { - if finished { - return Ok(None); - } - - let mut buffer = Vec::with_capacity(CHUNK_SIZE); - let read = file - .by_ref() - .take(CHUNK_SIZE as u64) - .read_to_end(&mut buffer) - .map_err(|e| local::Error::UnableToReadBytes { - source: e, - path: path.clone(), - })?; - - Ok(Some((buffer.into(), (file, path, read != CHUNK_SIZE)))) - }) - }, - ) - .boxed() + local::chunked_stream(file, path, self.range, CHUNK_SIZE) } - Self::Stream(s) => s, + GetResultPayload::Stream(s) => s, #[cfg(target_arch = "wasm32")] _ => unimplemented!("File IO not implemented on wasm32."), } diff --git a/src/limit.rs b/src/limit.rs index 630fd14..a9b8c4b 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -18,8 +18,8 @@ //! An object store that limits the maximum concurrency of the wrapped implementation use crate::{ - BoxStream, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, - Path, Result, StreamExt, + BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, + ObjectMeta, ObjectStore, Path, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -106,22 +106,14 @@ impl ObjectStore for LimitStore { async fn get(&self, location: &Path) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - match self.inner.get(location).await? { - r @ GetResult::File(_, _) => Ok(r), - GetResult::Stream(s) => { - Ok(GetResult::Stream(PermitWrapper::new(s, permit).boxed())) - } - } + let r = self.inner.get(location).await?; + Ok(permit_get_result(r, permit)) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - match self.inner.get_opts(location, options).await? 
{ - r @ GetResult::File(_, _) => Ok(r), - GetResult::Stream(s) => { - Ok(GetResult::Stream(PermitWrapper::new(s, permit).boxed())) - } - } + let r = self.inner.get_opts(location, options).await?; + Ok(permit_get_result(r, permit)) } async fn get_range(&self, location: &Path, range: Range) -> Result { @@ -200,6 +192,16 @@ impl ObjectStore for LimitStore { } } +fn permit_get_result(r: GetResult, permit: OwnedSemaphorePermit) -> GetResult { + let payload = match r.payload { + v @ GetResultPayload::File(_, _) => v, + GetResultPayload::Stream(s) => { + GetResultPayload::Stream(PermitWrapper::new(s, permit).boxed()) + } + }; + GetResult { payload, ..r } +} + /// Combines an [`OwnedSemaphorePermit`] with some other type struct PermitWrapper { inner: T, diff --git a/src/local.rs b/src/local.rs index a0933cc..4d57ef1 100644 --- a/src/local.rs +++ b/src/local.rs @@ -19,16 +19,17 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, + ObjectStore, Result, }; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::future::BoxFuture; -use futures::FutureExt; use futures::{stream::BoxStream, StreamExt}; +use futures::{FutureExt, TryStreamExt}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::fs::{metadata, symlink_metadata, File, OpenOptions}; +use std::fs::{metadata, symlink_metadata, File, Metadata, OpenOptions}; use std::io::{ErrorKind, Read, Seek, SeekFrom, Write}; use std::ops::Range; use std::pin::Pin; @@ -370,18 +371,20 @@ impl ObjectStore for LocalFileSystem { let location = location.clone(); let path = self.config.path_to_filesystem(&location)?; maybe_spawn_blocking(move || { - let file = open_file(&path)?; + let (file, metadata) = open_file(&path)?; if options.if_unmodified_since.is_some() || options.if_modified_since.is_some() { - let metadata = file.metadata().map_err(|e| Error::Metadata { - source: e.into(), - path: location.to_string(), - })?; options.check_modified(&location, last_modified(&metadata))?; } - Ok(GetResult::File(file, path)) + let meta = convert_metadata(metadata, location)?; + + Ok(GetResult { + payload: GetResultPayload::File(file, path), + range: options.range.unwrap_or(0..meta.size), + meta, + }) }) .await } @@ -389,7 +392,7 @@ impl ObjectStore for LocalFileSystem { async fn get_range(&self, location: &Path, range: Range) -> Result { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { - let mut file = open_file(&path)?; + let (mut file, _) = open_file(&path)?; read_range(&mut file, &path, range) }) .await @@ -404,7 +407,7 @@ impl ObjectStore for LocalFileSystem { let ranges = ranges.to_vec(); maybe_spawn_blocking(move || { // Vectored IO might be faster - let mut file = open_file(&path)?; + let (mut file, _) = open_file(&path)?; ranges .into_iter() .map(|r| read_range(&mut file, &path, r)) @@ -863,6 +866,51 @@ impl AsyncWrite for LocalUpload { } } +pub(crate) fn chunked_stream( + mut file: File, + path: PathBuf, + range: Range, + chunk_size: usize, +) -> BoxStream<'static, Result> { + futures::stream::once(async move { + let (file, path) = maybe_spawn_blocking(move || { + file.seek(SeekFrom::Start(range.start as _)) + .map_err(|source| Error::Seek { + source, + path: path.clone(), + })?; + Ok((file, path)) + }) + .await?; + + let stream = futures::stream::try_unfold( + (file, path, range.end - range.start), + move 
|(mut file, path, remaining)| { + maybe_spawn_blocking(move || { + if remaining == 0 { + return Ok(None); + } + + let to_read = remaining.min(chunk_size); + let mut buffer = Vec::with_capacity(to_read); + let read = (&mut file) + .take(to_read as u64) + .read_to_end(&mut buffer) + .map_err(|e| Error::UnableToReadBytes { + source: e, + path: path.clone(), + })?; + + Ok(Some((buffer.into(), (file, path, remaining - read)))) + }) + }, + ); + Ok::<_, super::Error>(stream) + }) + .try_flatten() + .boxed() +} + pub(crate) fn read_range( file: &mut File, path: &PathBuf, @@ -889,8 +937,8 @@ pub(crate) fn read_range( Ok(buf.into()) } -fn open_file(path: &PathBuf) -> Result { - let file = match File::open(path).and_then(|f| Ok((f.metadata()?, f))) { +fn open_file(path: &PathBuf) -> Result<(File, Metadata)> { + let ret = match File::open(path).and_then(|f| Ok((f.metadata()?, f))) { Err(e) => Err(match e.kind() { ErrorKind::NotFound => Error::NotFound { path: path.clone(), @@ -902,14 +950,14 @@ fn open_file(path: &PathBuf) -> Result { }, }), Ok((metadata, file)) => match !metadata.is_dir() { - true => Ok(file), + true => Ok((file, metadata)), false => Err(Error::NotFound { path: path.clone(), source: io::Error::new(ErrorKind::NotFound, "is directory"), }), }, }?; - Ok(file) + Ok(ret) } fn convert_entry(entry: DirEntry, location: Path) -> Result { @@ -927,7 +975,7 @@ fn last_modified(metadata: &std::fs::Metadata) -> DateTime { .into() } -fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { +fn convert_metadata(metadata: Metadata, location: Path) -> Result { let last_modified = last_modified(&metadata); let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { path: location.as_ref(), diff --git a/src/memory.rs b/src/memory.rs index cfc2ac8..1e8e3c1 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -16,7 +16,9 @@ // under the License. //! An in-memory object store implementation -use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use crate::{ + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, Result, +}; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; @@ -43,11 +45,13 @@ enum Error { #[snafu(display("No data in memory found. 
Location: {path}"))] NoDataInMemory { path: String }, - #[snafu(display("Out of range"))] - OutOfRange, + #[snafu(display( + "Requested range {}..{} is out of bounds for object with length {}", range.start, range.end, len + ))] + OutOfRange { range: Range, len: usize }, - #[snafu(display("Bad range"))] - BadRange, + #[snafu(display("Invalid range: {}..{}", range.start, range.end))] + BadRange { range: Range }, #[snafu(display("Object already exists at that location: {path}"))] AlreadyExists { path: String }, @@ -136,17 +140,29 @@ impl ObjectStore for InMemory { } let (data, last_modified) = self.entry(location).await?; options.check_modified(location, last_modified)?; + let meta = ObjectMeta { + location: location.clone(), + last_modified, + size: data.len(), + e_tag: None, + }; + let (range, data) = match options.range { + Some(range) => { + let len = data.len(); + ensure!(range.end <= len, OutOfRangeSnafu { range, len }); + ensure!(range.start <= range.end, BadRangeSnafu { range }); + (range.clone(), data.slice(range)) + } + None => (0..data.len(), data), + }; let stream = futures::stream::once(futures::future::ready(Ok(data))); - Ok(GetResult::Stream(stream.boxed())) - } - - async fn get_range(&self, location: &Path, range: Range) -> Result { - let data = self.entry(location).await?; - ensure!(range.end <= data.0.len(), OutOfRangeSnafu); - ensure!(range.start <= range.end, BadRangeSnafu); - Ok(data.0.slice(range)) + Ok(GetResult { + payload: GetResultPayload::Stream(stream.boxed()), + meta, + range, + }) } async fn get_ranges( @@ -158,9 +174,11 @@ impl ObjectStore for InMemory { ranges .iter() .map(|range| { - ensure!(range.end <= data.0.len(), OutOfRangeSnafu); - ensure!(range.start <= range.end, BadRangeSnafu); - Ok(data.0.slice(range.clone())) + let range = range.clone(); + let len = data.0.len(); + ensure!(range.end <= data.0.len(), OutOfRangeSnafu { range, len }); + ensure!(range.start <= range.end, BadRangeSnafu { range }); + Ok(data.0.slice(range)) }) .collect() } diff --git a/src/throttle.rs b/src/throttle.rs index fb90afc..58c476a 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -20,7 +20,9 @@ use parking_lot::Mutex; use std::ops::Range; use std::{convert::TryInto, sync::Arc}; -use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use crate::{ + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, Result, +}; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; @@ -301,15 +303,20 @@ fn usize_to_u32_saturate(x: usize) -> u32 { } fn throttle_get(result: GetResult, wait_get_per_byte: Duration) -> GetResult { - let s = match result { - GetResult::Stream(s) => s, - GetResult::File(_, _) => unimplemented!(), + let s = match result.payload { + GetResultPayload::Stream(s) => s, + GetResultPayload::File(_, _) => unimplemented!(), }; - GetResult::Stream(throttle_stream(s, move |bytes| { + let stream = throttle_stream(s, move |bytes| { let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); wait_get_per_byte * bytes_len - })) + }); + + GetResult { + payload: GetResultPayload::Stream(stream), + ..result + } } fn throttle_stream( @@ -330,7 +337,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::{memory::InMemory, tests::*}; + use crate::{memory::InMemory, tests::*, GetResultPayload}; use bytes::Bytes; use futures::TryStreamExt; use tokio::time::Duration; @@ -550,9 +557,9 @@ mod tests { let res = store.get(&path).await; if n_bytes.is_some() { // need to consume bytes to provoke sleep times 
- let s = match res.unwrap() { - GetResult::Stream(s) => s, - GetResult::File(_, _) => unimplemented!(), + let s = match res.unwrap().payload { + GetResultPayload::Stream(s) => s, + GetResultPayload::File(_, _) => unimplemented!(), }; s.map_ok(|b| bytes::BytesMut::from(&b[..])) From 40cc57aacb09979e5a8e5e47e7c505cf38c5dbb8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:51:36 +0100 Subject: [PATCH 176/397] Prepare object_store 0.7.0 (#4699) --- CHANGELOG.md | 45 +++++++++++++++++++++++--------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +-- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe25e23..1250639 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,30 +19,51 @@ # Changelog -## [object_store_0.6.1](https://github.com/apache/arrow-rs/tree/object_store_0.6.1) (2023-06-02) +## [object_store_0.7.0](https://github.com/apache/arrow-rs/tree/object_store_0.7.0) (2023-08-15) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.0...object_store_0.6.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.1...object_store_0.7.0) + +**Breaking changes:** + +- Add range and ObjectMeta to GetResult \(\#4352\) \(\#4495\) [\#4677](https://github.com/apache/arrow-rs/pull/4677) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Support multipart upload in R2 [\#4304](https://github.com/apache/arrow-rs/issues/4304) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add AzureConfigKey::ContainerName [\#4629](https://github.com/apache/arrow-rs/issues/4629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) +- Make object\_store::multipart public [\#4569](https://github.com/apache/arrow-rs/issues/4569) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Export `ClientConfigKey` and make the `HttpBuilder` more consistent with other builders [\#4515](https://github.com/apache/arrow-rs/issues/4515) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store/InMemory: Make `clone()` non-async [\#4496](https://github.com/apache/arrow-rs/issues/4496) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add Range to GetResult::File [\#4352](https://github.com/apache/arrow-rs/issues/4352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support copy\_if\_not\_exists for Cloudflare R2 \(S3 API\) [\#4190](https://github.com/apache/arrow-rs/issues/4190) **Fixed bugs:** -- Default ObjectStore::get\_range Doesn't Apply Range to GetResult::File [\#4350](https://github.com/apache/arrow-rs/issues/4350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store documentation is broken [\#4683](https://github.com/apache/arrow-rs/issues/4683) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Exports are not sufficient for configuring some object stores, for example minio running locally [\#4530](https://github.com/apache/arrow-rs/issues/4530) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Uploading empty file to S3 results in "411 Length Required" 
[\#4514](https://github.com/apache/arrow-rs/issues/4514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- GCP doesn't fetch public objects [\#4417](https://github.com/apache/arrow-rs/issues/4417) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Closed issues:** -- \[object\_store - AmazonS3Builder\] incorrect metadata\_endpoint set in `from_env` in an ECS environment [\#4283](https://github.com/apache/arrow-rs/issues/4283) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) +- AWS Profile credentials no longer working in object\_store 0.6.1 [\#4556](https://github.com/apache/arrow-rs/issues/4556) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Fix ObjectStore::get\_range for GetResult::File \(\#4350\) [\#4351](https://github.com/apache/arrow-rs/pull/4351) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Don't exclude FIFO files from LocalFileSystem [\#4345](https://github.com/apache/arrow-rs/pull/4345) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix support for ECS IAM credentials [\#4310](https://github.com/apache/arrow-rs/pull/4310) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat: use exactly equal parts in multipart upload [\#4305](https://github.com/apache/arrow-rs/pull/4305) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- Set ECS specific metadata endpoint [\#4288](https://github.com/apache/arrow-rs/pull/4288) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jfuechsl](https://github.com/jfuechsl)) -- Prepare 40.0.0 release [\#4245](https://github.com/apache/arrow-rs/pull/4245) ([tustvold](https://github.com/tustvold)) -- feat: support bulk deletes in object\_store [\#4060](https://github.com/apache/arrow-rs/pull/4060) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Add AzureConfigKey::ContainerName \(\#4629\) [\#4686](https://github.com/apache/arrow-rs/pull/4686) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) +- Use Config System for Object Store Integration Tests [\#4628](https://github.com/apache/arrow-rs/pull/4628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Prepare arrow 45 [\#4590](https://github.com/apache/arrow-rs/pull/4590) ([tustvold](https://github.com/tustvold)) +- Add Support for Microsoft Fabric / OneLake [\#4573](https://github.com/apache/arrow-rs/pull/4573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vmuddassir-msft](https://github.com/vmuddassir-msft)) +- Cleanup multipart upload trait [\#4572](https://github.com/apache/arrow-rs/pull/4572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make object\_store::multipart public 
[\#4570](https://github.com/apache/arrow-rs/pull/4570) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([yjshen](https://github.com/yjshen)) +- Handle empty S3 payloads \(\#4514\) [\#4518](https://github.com/apache/arrow-rs/pull/4518) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: Export `ClientConfigKey` and add `HttpBuilder::with_config` [\#4516](https://github.com/apache/arrow-rs/pull/4516) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thehabbos007](https://github.com/thehabbos007)) +- object\_store: Implement `ObjectStore` for `Arc` [\#4502](https://github.com/apache/arrow-rs/pull/4502) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- object\_store/InMemory: Add `fork()` fn and deprecate `clone()` fn [\#4499](https://github.com/apache/arrow-rs/pull/4499) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) +- gcp: Exclude authorization header when bearer empty [\#4418](https://github.com/apache/arrow-rs/pull/4418) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vrongmeal](https://github.com/vrongmeal)) +- Support copy\_if\_not\_exists for Cloudflare R2 \(\#4190\) [\#4239](https://github.com/apache/arrow-rs/pull/4239) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index eca5a5c..7ef395a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.6.1" +version = "0.7.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 3e9f8bd..48835c7 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.6.0" -FUTURE_RELEASE="object_store_0.6.1" +SINCE_TAG="object_store_0.6.1" +FUTURE_RELEASE="object_store_0.7.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 74f4ffc87ae98624544c7afb693f2cf0c90fcece Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 16 Aug 2023 13:09:51 +0100 Subject: [PATCH 177/397] Update object_store Dependencies and Configure Dependabot (#4700) * Update itertools and quick-xml * Add dependabot --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7ef395a..3c10f4a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ bytes = "1.0" chrono = { version = "0.4.23", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" -itertools = "0.10.1" +itertools = "0.11.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } hyper = { version = "0.14", default-features = false, optional = true } 
-quick-xml = { version = "0.28.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.30.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 944b91d44f713ebb21b065f524f2a09533add49a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 24 Aug 2023 19:54:13 +0100 Subject: [PATCH 178/397] Fix new clippy lints (#4734) * Fix new clippy lints * More clippy * Even more clippy * Clippy --- src/memory.rs | 2 +- src/util.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/memory.rs b/src/memory.rs index 1e8e3c1..0e22988 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -393,7 +393,7 @@ impl AsyncWrite for InMemoryAppend { if let Some((bytes, _)) = writer.remove(&self.location) { let buf = std::mem::take(&mut self.data); - let concat = Bytes::from_iter(bytes.into_iter().chain(buf.into_iter())); + let concat = Bytes::from_iter(bytes.into_iter().chain(buf)); writer.insert(self.location.clone(), (concat, Utc::now())); } else { writer.insert( diff --git a/src/util.rs b/src/util.rs index 79ca4bb..07d3ed4 100644 --- a/src/util.rs +++ b/src/util.rs @@ -207,7 +207,7 @@ mod tests { let fetches = do_fetch(vec![], 0).await; assert!(fetches.is_empty()); - let fetches = do_fetch(vec![0..3], 0).await; + let fetches = do_fetch(vec![0..3; 1], 0).await; assert_eq!(fetches, vec![0..3]); let fetches = do_fetch(vec![0..2, 3..5], 0).await; From 7da7975ade510e5d0ed258bf18478f0d569456ab Mon Sep 17 00:00:00 2001 From: Gordon Wang <36049150+gordonwang0@users.noreply.github.com> Date: Fri, 25 Aug 2023 01:07:26 -0700 Subject: [PATCH 179/397] Add `with_proxy_ca_certificate` and `with_proxy_excludes` (#4714) * Add proxy_auth and proxy_exclude * Add proxy_ca_certificate * Add public fns * rename fn * clippy fix --- src/aws/mod.rs | 17 ++++++++++++++++ src/azure/mod.rs | 19 +++++++++++++++++- src/client/mod.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++--- src/gcp/mod.rs | 17 ++++++++++++++++ 4 files changed, 99 insertions(+), 4 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 7e16b5a..db3e1b9 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -946,6 +946,23 @@ impl AmazonS3Builder { self } + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + /// Sets the client options, overriding any already set pub fn with_client_options(mut self, options: ClientOptions) -> Self { self.client_options = options; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 27bbbfb..2a07710 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -436,7 +436,7 @@ pub enum AzureConfigKey { /// Use object store with url scheme account.dfs.fabric.microsoft.com /// - /// Supported keys: + /// Supported keys: /// - `azure_use_fabric_endpoint` /// - `use_fabric_endpoint` UseFabricEndpoint, @@ -909,6 +909,23 @@ impl 
MicrosoftAzureBuilder { self } + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + /// Sets the client options, overriding any already set pub fn with_client_options(mut self, options: ClientOptions) -> Self { self.client_options = options; diff --git a/src/client/mod.rs b/src/client/mod.rs index 5f3a042..d4995a5 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -48,7 +48,7 @@ use std::sync::Arc; use std::time::Duration; use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, ClientBuilder, Proxy, RequestBuilder}; +use reqwest::{Client, ClientBuilder, NoProxy, Proxy, RequestBuilder}; use serde::{Deserialize, Serialize}; use crate::config::{fmt_duration, ConfigValue}; @@ -103,6 +103,10 @@ pub enum ClientConfigKey { PoolMaxIdlePerHost, /// HTTP proxy to use for requests ProxyUrl, + /// PEM-formatted CA certificate for proxy connections + ProxyCaCertificate, + /// List of hosts that bypass proxy + ProxyExcludes, /// Request timeout /// /// The timeout is applied from when the request starts connecting until the @@ -127,6 +131,8 @@ impl AsRef for ClientConfigKey { Self::PoolIdleTimeout => "pool_idle_timeout", Self::PoolMaxIdlePerHost => "pool_max_idle_per_host", Self::ProxyUrl => "proxy_url", + Self::ProxyCaCertificate => "proxy_ca_certificate", + Self::ProxyExcludes => "proxy_excludes", Self::Timeout => "timeout", Self::UserAgent => "user_agent", } @@ -168,6 +174,8 @@ pub struct ClientOptions { default_content_type: Option, default_headers: Option, proxy_url: Option, + proxy_ca_certificate: Option, + proxy_excludes: Option, allow_http: ConfigValue, allow_insecure: ConfigValue, timeout: Option>, @@ -216,6 +224,10 @@ impl ClientOptions { self.pool_max_idle_per_host = Some(ConfigValue::Deferred(value.into())) } ClientConfigKey::ProxyUrl => self.proxy_url = Some(value.into()), + ClientConfigKey::ProxyCaCertificate => { + self.proxy_ca_certificate = Some(value.into()) + } + ClientConfigKey::ProxyExcludes => self.proxy_excludes = Some(value.into()), ClientConfigKey::Timeout => { self.timeout = Some(ConfigValue::Deferred(value.into())) } @@ -255,6 +267,8 @@ impl ClientOptions { self.pool_max_idle_per_host.as_ref().map(|v| v.to_string()) } ClientConfigKey::ProxyUrl => self.proxy_url.clone(), + ClientConfigKey::ProxyCaCertificate => self.proxy_ca_certificate.clone(), + ClientConfigKey::ProxyExcludes => self.proxy_excludes.clone(), ClientConfigKey::Timeout => self.timeout.as_ref().map(fmt_duration), ClientConfigKey::UserAgent => self .user_agent @@ -329,12 +343,27 @@ impl ClientOptions { self } - /// Set an HTTP proxy to use for requests + /// Set a proxy URL to use for requests pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { self.proxy_url = Some(proxy_url.into()); self } + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.proxy_ca_certificate = Some(proxy_ca_certificate.into()); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.proxy_excludes = 
Some(proxy_excludes.into()); + self + } + /// Set a request timeout /// /// The timeout is applied from when the request starts connecting until the @@ -429,7 +458,22 @@ impl ClientOptions { } if let Some(proxy) = &self.proxy_url { - let proxy = Proxy::all(proxy).map_err(map_client_error)?; + let mut proxy = Proxy::all(proxy).map_err(map_client_error)?; + + if let Some(certificate) = &self.proxy_ca_certificate { + let certificate = + reqwest::tls::Certificate::from_pem(certificate.as_bytes()) + .map_err(map_client_error)?; + + builder = builder.add_root_certificate(certificate); + } + + if let Some(proxy_excludes) = &self.proxy_excludes { + let no_proxy = NoProxy::from_string(proxy_excludes); + + proxy = proxy.no_proxy(no_proxy); + } + builder = builder.proxy(proxy); } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 58a5d19..3f5bf62 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -992,6 +992,23 @@ impl GoogleCloudStorageBuilder { self } + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + /// Sets the client options, overriding any already set pub fn with_client_options(mut self, options: ClientOptions) -> Self { self.client_options = options; From 1714a050740e1de7cb55cb819c6244ee59df942a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 28 Aug 2023 13:31:01 -0700 Subject: [PATCH 180/397] Update nix requirement from 0.26.1 to 0.27.1 in /object_store (#4744) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 3c10f4a..b8d4391 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } [target.'cfg(target_family="unix")'.dev-dependencies] -nix = "0.26.1" +nix = { version = "0.27.1", features = ["fs"] } [features] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] From 6a163d6878a685850d140e6cacf7996ace700ed3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 2 Sep 2023 10:02:20 +0100 Subject: [PATCH 181/397] Make ObjectStore::copy Atomic and Automatically Create Parent Directories (#4758) (#4760) (#4759) * Make LocalFileSystem::copy atomic (#4758) * Create sub-directories for copy (#4760) * Fix HttpStore * Clippy * Tweak error propagation * Add doc --- src/http/client.rs | 50 +++++++++++--------- src/lib.rs | 20 +++++++- src/local.rs | 112 ++++++++++++++++++++++++++++----------------- 3 files changed, 115 insertions(+), 67 deletions(-) diff --git a/src/http/client.rs b/src/http/client.rs index 1d3df34..93cd4ee 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -256,31 +256,37 @@ impl Client { } pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { - let from = self.path_url(from); - let to = self.path_url(to); - let method = Method::from_bytes(b"COPY").unwrap(); - - let mut builder = self - .client - .request(method, from) - .header("Destination", 
to.as_str()); + let mut retry = false; + loop { + let method = Method::from_bytes(b"COPY").unwrap(); - if !overwrite { - builder = builder.header("Overwrite", "F"); - } + let mut builder = self + .client + .request(method, self.path_url(from)) + .header("Destination", self.path_url(to).as_str()); - match builder.send_retry(&self.retry_config).await { - Ok(_) => Ok(()), - Err(e) - if !overwrite - && matches!(e.status(), Some(StatusCode::PRECONDITION_FAILED)) => - { - Err(crate::Error::AlreadyExists { - path: to.to_string(), - source: Box::new(e), - }) + if !overwrite { + builder = builder.header("Overwrite", "F"); } - Err(source) => Err(Error::Request { source }.into()), + + return match builder.send_retry(&self.retry_config).await { + Ok(_) => Ok(()), + Err(source) => Err(match source.status() { + Some(StatusCode::PRECONDITION_FAILED) if !overwrite => { + crate::Error::AlreadyExists { + path: to.to_string(), + source: Box::new(source), + } + } + // Some implementations return 404 instead of 409 + Some(StatusCode::CONFLICT | StatusCode::NOT_FOUND) if !retry => { + retry = true; + self.create_parent_directories(to).await?; + continue; + } + _ => Error::Request { source }.into(), + }), + }; } } } diff --git a/src/lib.rs b/src/lib.rs index 7496b58..d1ee83b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1105,8 +1105,24 @@ mod tests { files.sort_unstable(); assert_eq!(files, vec![emoji_file.clone(), dst.clone()]); + let dst2 = Path::from("new/nested/foo.parquet"); + storage.copy(&emoji_file, &dst2).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst.clone(), dst2.clone()]); + + let dst3 = Path::from("new/nested2/bar.parquet"); + storage.rename(&dst, &dst3).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst2.clone(), dst3.clone()]); + + let err = storage.head(&dst).await.unwrap_err(); + assert!(matches!(err, Error::NotFound { .. 
})); + storage.delete(&emoji_file).await.unwrap(); - storage.delete(&dst).await.unwrap(); + storage.delete(&dst3).await.unwrap(); + storage.delete(&dst2).await.unwrap(); let files = flatten_list_stream(storage, Some(&emoji_prefix)) .await .unwrap(); @@ -1605,7 +1621,7 @@ mod tests { pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) { // Create two objects let path1 = Path::from("test1"); - let path2 = Path::from("test2"); + let path2 = Path::from("not_exists_nested/test2"); let contents1 = Bytes::from("cats"); let contents2 = Bytes::from("dogs"); diff --git a/src/local.rs b/src/local.rs index 4d57ef1..495bb4f 100644 --- a/src/local.rs +++ b/src/local.rs @@ -28,7 +28,7 @@ use chrono::{DateTime, Utc}; use futures::future::BoxFuture; use futures::{stream::BoxStream, StreamExt}; use futures::{FutureExt, TryStreamExt}; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use snafu::{ensure, ResultExt, Snafu}; use std::fs::{metadata, symlink_metadata, File, Metadata, OpenOptions}; use std::io::{ErrorKind, Read, Seek, SeekFrom, Write}; use std::ops::Range; @@ -78,10 +78,10 @@ pub(crate) enum Error { path: PathBuf, }, - #[snafu(display("Unable to create file {}: {}", path.display(), err))] + #[snafu(display("Unable to create file {}: {}", path.display(), source))] UnableToCreateFile { + source: io::Error, path: PathBuf, - err: io::Error, }, #[snafu(display("Unable to delete file {}: {}", path.display(), source))] @@ -336,12 +336,13 @@ impl ObjectStore for LocalFileSystem { // If the file was successfully opened, return it wrapped in a boxed `AsyncWrite` trait object. Ok(file) => return Ok(Box::new(file)), // If the error is that the file was not found, attempt to create the file and any necessary parent directories. - Err(err) if err.kind() == ErrorKind::NotFound => { + Err(source) if source.kind() == ErrorKind::NotFound => { // Get the path to the parent directory of the file. - let parent = path - .parent() - // If the parent directory does not exist, return a `UnableToCreateFileSnafu` error. - .context(UnableToCreateFileSnafu { path: &path, err })?; + let parent = + path.parent().ok_or_else(|| Error::UnableToCreateFile { + path: path.to_path_buf(), + source, + })?; // Create the parent directory and any necessary ancestors. 
tokio::fs::create_dir_all(parent) @@ -584,10 +585,27 @@ impl ObjectStore for LocalFileSystem { async fn copy(&self, from: &Path, to: &Path) -> Result<()> { let from = self.config.path_to_filesystem(from)?; let to = self.config.path_to_filesystem(to)?; - - maybe_spawn_blocking(move || { - std::fs::copy(&from, &to).context(UnableToCopyFileSnafu { from, to })?; - Ok(()) + let mut id = 0; + // In order to make this atomic we: + // + // - hard link to a hidden temporary file + // - atomically rename this temporary file into place + // + // This is necessary because hard_link returns an error if the destination already exists + maybe_spawn_blocking(move || loop { + let staged = staged_upload_path(&to, &id.to_string()); + match std::fs::hard_link(&from, &staged) { + Ok(_) => { + return std::fs::rename(&staged, &to).map_err(|source| { + Error::UnableToCopyFile { from, to, source }.into() + }) + } + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => id += 1, + ErrorKind::NotFound => create_parent_dirs(&to, source)?, + _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), + }, + } }) .await } @@ -595,9 +613,14 @@ impl ObjectStore for LocalFileSystem { async fn rename(&self, from: &Path, to: &Path) -> Result<()> { let from = self.config.path_to_filesystem(from)?; let to = self.config.path_to_filesystem(to)?; - maybe_spawn_blocking(move || { - std::fs::rename(&from, &to).context(UnableToCopyFileSnafu { from, to })?; - Ok(()) + maybe_spawn_blocking(move || loop { + match std::fs::rename(&from, &to) { + Ok(_) => return Ok(()), + Err(source) => match source.kind() { + ErrorKind::NotFound => create_parent_dirs(&to, source)?, + _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), + }, + } }) .await } @@ -606,25 +629,37 @@ impl ObjectStore for LocalFileSystem { let from = self.config.path_to_filesystem(from)?; let to = self.config.path_to_filesystem(to)?; - maybe_spawn_blocking(move || { - std::fs::hard_link(&from, &to).map_err(|err| match err.kind() { - io::ErrorKind::AlreadyExists => Error::AlreadyExists { - path: to.to_str().unwrap().to_string(), - source: err, - } - .into(), - _ => Error::UnableToCopyFile { - from, - to, - source: err, - } - .into(), - }) + maybe_spawn_blocking(move || loop { + match std::fs::hard_link(&from, &to) { + Ok(_) => return Ok(()), + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => { + return Err(Error::AlreadyExists { + path: to.to_str().unwrap().to_string(), + source, + } + .into()) + } + ErrorKind::NotFound => create_parent_dirs(&to, source)?, + _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), + }, + } }) .await } } +/// Creates the parent directories of `path` or returns an error based on `source` if no parent +fn create_parent_dirs(path: &std::path::Path, source: io::Error) -> Result<()> { + let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { + path: path.to_path_buf(), + source, + })?; + + std::fs::create_dir_all(parent).context(UnableToCreateDirSnafu { path: parent })?; + Ok(()) +} + /// Generates a unique file path `{base}#{suffix}`, returning the opened `File` and `suffix` /// /// Creates any directories if necessary @@ -636,20 +671,11 @@ fn new_staged_upload(base: &std::path::Path) -> Result<(File, String)> { let mut options = OpenOptions::new(); match options.read(true).write(true).create_new(true).open(&path) { Ok(f) => return Ok((f, suffix)), - Err(e) if e.kind() == ErrorKind::AlreadyExists => { - multipart_id += 1; - } - Err(err) if err.kind() == 
ErrorKind::NotFound => { - let parent = path - .parent() - .context(UnableToCreateFileSnafu { path: &path, err })?; - - std::fs::create_dir_all(parent) - .context(UnableToCreateDirSnafu { path: parent })?; - - continue; - } - Err(source) => return Err(Error::UnableToOpenFile { source, path }.into()), + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => multipart_id += 1, + ErrorKind::NotFound => create_parent_dirs(&path, source)?, + _ => return Err(Error::UnableToOpenFile { source, path }.into()), + }, } } } From 775a469771f6935a3c43f78cfbead84be1f2260f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:19:03 +0100 Subject: [PATCH 182/397] Update object_store chrono deprecations (#4786) --- src/util.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/util.rs b/src/util.rs index 07d3ed4..25b0fc3 100644 --- a/src/util.rs +++ b/src/util.rs @@ -32,8 +32,9 @@ where D: serde::Deserializer<'de>, { let s: String = serde::Deserialize::deserialize(deserializer)?; - chrono::TimeZone::datetime_from_str(&chrono::Utc, &s, RFC1123_FMT) - .map_err(serde::de::Error::custom) + let naive = chrono::NaiveDateTime::parse_from_str(&s, RFC1123_FMT) + .map_err(serde::de::Error::custom)?; + Ok(chrono::TimeZone::from_utc_datetime(&chrono::Utc, &naive)) } #[cfg(any(feature = "aws", feature = "azure"))] From c63f9e19b69eb8966bc00ee2cced34d9147cd524 Mon Sep 17 00:00:00 2001 From: Valery Meleshkin Date: Thu, 7 Sep 2023 13:59:59 +0200 Subject: [PATCH 183/397] Make coalesce_ranges and collect_bytes available to the users (#4784) of the object_store crate. --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index d1ee83b..413b400 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -282,7 +282,7 @@ pub use parse::{parse_url, parse_url_opts}; use crate::path::Path; #[cfg(not(target_arch = "wasm32"))] use crate::util::maybe_spawn_blocking; -use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT}; +pub use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; From 67c44c29b0ab98ee0d40d2fa0b79d7abb13da954 Mon Sep 17 00:00:00 2001 From: Valery Meleshkin Date: Thu, 7 Sep 2023 16:19:06 +0200 Subject: [PATCH 184/397] Relaxing type bounds on coalesce_ranges and collect_bytes (#4787) to allow using them with a wider range of Error types. 
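(Editorial illustration, not part of this commit.) A minimal usage sketch of what the relaxed bounds allow: driving `coalesce_ranges` with a caller-defined error type instead of `object_store::Error`. The `MyError` type and the in-memory `data` buffer below are hypothetical stand-ins; the signature assumed here is the one introduced by this patch in `src/util.rs`.

    use bytes::Bytes;
    use object_store::coalesce_ranges;

    // Hypothetical caller-defined error type (illustration only).
    #[derive(Debug)]
    struct MyError(String);

    // Fetch two byte ranges from an in-memory buffer, coalescing requests that
    // are less than 1 MiB apart, and surface failures as `MyError` rather than
    // `object_store::Error`.
    async fn fetch_coalesced(data: Bytes) -> Result<Vec<Bytes>, MyError> {
        coalesce_ranges(
            &[0..3, 5..10],
            |range| {
                let data = data.clone();
                async move {
                    if range.end <= data.len() {
                        Ok(data.slice(range))
                    } else {
                        Err(MyError(format!("range {range:?} out of bounds")))
                    }
                }
            },
            1024 * 1024, // coalesce threshold in bytes
        )
        .await
    }
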
--- src/util.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/util.rs b/src/util.rs index 25b0fc3..764582a 100644 --- a/src/util.rs +++ b/src/util.rs @@ -47,9 +47,13 @@ pub(crate) fn hmac_sha256( } /// Collect a stream into [`Bytes`] avoiding copying in the event of a single chunk -pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result +pub async fn collect_bytes( + mut stream: S, + size_hint: Option, +) -> Result where - S: Stream> + Send + Unpin, + E: Send, + S: Stream> + Send + Unpin, { let first = stream.next().await.transpose()?.unwrap_or_default(); @@ -99,14 +103,15 @@ pub const OBJECT_STORE_COALESCE_PARALLEL: usize = 10; /// * Combine ranges less than `coalesce` bytes apart into a single call to `fetch` /// * Make multiple `fetch` requests in parallel (up to maximum of 10) /// -pub async fn coalesce_ranges( +pub async fn coalesce_ranges( ranges: &[std::ops::Range], fetch: F, coalesce: usize, -) -> Result> +) -> Result, E> where F: Send + FnMut(std::ops::Range) -> Fut, - Fut: std::future::Future> + Send, + E: Send, + Fut: std::future::Future> + Send, { let fetch_ranges = merge_ranges(ranges, coalesce); @@ -173,6 +178,8 @@ fn merge_ranges( #[cfg(test)] mod tests { + use crate::Error; + use super::*; use rand::{thread_rng, Rng}; use std::ops::Range; @@ -185,7 +192,7 @@ mod tests { let src: Vec<_> = (0..max).map(|x| x as u8).collect(); let mut fetches = vec![]; - let coalesced = coalesce_ranges( + let coalesced = coalesce_ranges::<_, Error, _>( &ranges, |range| { fetches.push(range.clone()); From 8ad924162a11ddccc1fcd33128bb409bbd66a0c8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 8 Sep 2023 08:43:38 +0100 Subject: [PATCH 185/397] Best effort cleanup of staged upload files (#4778) (#4792) * Best effort cleanup of staged upload files (#4778) * Clippy * Fix MSRV --- src/local.rs | 142 +++++++++++++++++++++++++++++---------------------- 1 file changed, 81 insertions(+), 61 deletions(-) diff --git a/src/local.rs b/src/local.rs index 495bb4f..20eb3c6 100644 --- a/src/local.rs +++ b/src/local.rs @@ -26,6 +26,7 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::future::BoxFuture; +use futures::ready; use futures::{stream::BoxStream, StreamExt}; use futures::{FutureExt, TryStreamExt}; use snafu::{ensure, ResultExt, Snafu}; @@ -274,13 +275,15 @@ impl ObjectStore for LocalFileSystem { maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); - file.write_all(&bytes) - .context(UnableToCopyDataToFileSnafu)?; - - std::fs::rename(staging_path, path).context(UnableToRenameFileSnafu)?; - - Ok(()) + .context(UnableToCopyDataToFileSnafu) + .and_then(|_| { + std::fs::rename(&staging_path, &path).context(UnableToRenameFileSnafu) + }) + .map_err(|e| { + let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup + e.into() + }) }) .await } @@ -304,12 +307,14 @@ impl ObjectStore for LocalFileSystem { multipart_id: &MultipartId, ) -> Result<()> { let dest = self.config.path_to_filesystem(location)?; - let staging_path: PathBuf = staged_upload_path(&dest, multipart_id); + let path: PathBuf = staged_upload_path(&dest, multipart_id); - maybe_spawn_blocking(move || { - std::fs::remove_file(&staging_path) - .context(UnableToDeleteFileSnafu { path: staging_path })?; - Ok(()) + maybe_spawn_blocking(move || match std::fs::remove_file(&path) { + Ok(_) => Ok(()), + 
Err(source) => match source.kind() { + ErrorKind::NotFound => Ok(()), // Already deleted + _ => Err(Error::UnableToDeleteFile { path, source }.into()), + }, }) .await } @@ -318,7 +323,6 @@ impl ObjectStore for LocalFileSystem { &self, location: &Path, ) -> Result> { - #[cfg(not(target_arch = "wasm32"))] // Get the path to the file from the configuration. let path = self.config.path_to_filesystem(location)?; loop { @@ -358,8 +362,6 @@ impl ObjectStore for LocalFileSystem { } } } - #[cfg(target_arch = "wasm32")] - Err(super::Error::NotImplemented) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { @@ -597,8 +599,9 @@ impl ObjectStore for LocalFileSystem { match std::fs::hard_link(&from, &staged) { Ok(_) => { return std::fs::rename(&staged, &to).map_err(|source| { + let _ = std::fs::remove_file(&staged); // Attempt to clean up Error::UnableToCopyFile { from, to, source }.into() - }) + }); } Err(source) => match source.kind() { ErrorKind::AlreadyExists => id += 1, @@ -690,12 +693,9 @@ fn staged_upload_path(dest: &std::path::Path, suffix: &str) -> PathBuf { enum LocalUploadState { /// Upload is ready to send new data - Idle(Arc), + Idle(Arc), /// In the middle of a write - Writing( - Arc, - BoxFuture<'static, Result>, - ), + Writing(Arc, BoxFuture<'static, Result>), /// In the middle of syncing data and closing file. /// /// Future will contain last reference to file, so it will call drop on completion. @@ -713,11 +713,7 @@ struct LocalUpload { } impl LocalUpload { - pub fn new( - dest: PathBuf, - multipart_id: MultipartId, - file: Arc, - ) -> Self { + pub fn new(dest: PathBuf, multipart_id: MultipartId, file: Arc) -> Self { Self { inner_state: LocalUploadState::Idle(file), dest, @@ -731,14 +727,13 @@ impl AsyncWrite for LocalUpload { mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> std::task::Poll> { - let invalid_state = - |condition: &str| -> std::task::Poll> { - Poll::Ready(Err(io::Error::new( - io::ErrorKind::InvalidInput, - format!("Tried to write to file {condition}."), - ))) - }; + ) -> Poll> { + let invalid_state = |condition: &str| -> Poll> { + Poll::Ready(Err(io::Error::new( + ErrorKind::InvalidInput, + format!("Tried to write to file {condition}."), + ))) + }; if let Ok(runtime) = tokio::runtime::Handle::try_current() { let mut data: Vec = buf.to_vec(); @@ -757,7 +752,7 @@ impl AsyncWrite for LocalUpload { .spawn_blocking(move || (&*file2).write_all(&data)) .map(move |res| match res { Err(err) => { - Err(io::Error::new(io::ErrorKind::Other, err)) + Err(io::Error::new(ErrorKind::Other, err)) } Ok(res) => res.map(move |_| data_len), }), @@ -765,16 +760,9 @@ impl AsyncWrite for LocalUpload { ); } LocalUploadState::Writing(file, inner_write) => { - match inner_write.poll_unpin(cx) { - Poll::Ready(res) => { - self.inner_state = - LocalUploadState::Idle(Arc::clone(file)); - return Poll::Ready(res); - } - Poll::Pending => { - return Poll::Pending; - } - } + let res = ready!(inner_write.poll_unpin(cx)); + self.inner_state = LocalUploadState::Idle(Arc::clone(file)); + return Poll::Ready(res); } LocalUploadState::ShuttingDown(_) => { return invalid_state("when writer is shutting down"); @@ -800,14 +788,14 @@ impl AsyncWrite for LocalUpload { fn poll_flush( self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { Poll::Ready(Ok(())) } fn poll_shutdown( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { if let Ok(runtime) = 
tokio::runtime::Handle::try_current() { loop { match &mut self.inner_state { @@ -854,13 +842,11 @@ impl AsyncWrite for LocalUpload { "Tried to commit a file where a write is in progress.", ))); } - LocalUploadState::Committing(fut) => match fut.poll_unpin(cx) { - Poll::Ready(res) => { - self.inner_state = LocalUploadState::Complete; - return Poll::Ready(res); - } - Poll::Pending => return Poll::Pending, - }, + LocalUploadState::Committing(fut) => { + let res = ready!(fut.poll_unpin(cx)); + self.inner_state = LocalUploadState::Complete; + return Poll::Ready(res); + } LocalUploadState::Complete => { return Poll::Ready(Err(io::Error::new( io::ErrorKind::Other, @@ -876,22 +862,36 @@ impl AsyncWrite for LocalUpload { let file = Arc::clone(file); self.inner_state = LocalUploadState::Complete; file.sync_all()?; - std::mem::drop(file); + drop(file); std::fs::rename(staging_path, &self.dest)?; Poll::Ready(Ok(())) } _ => { // If we are running on this thread, then only possible states are Idle and Complete. - Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "Already complete", - ))) + Poll::Ready(Err(io::Error::new(ErrorKind::Other, "Already complete"))) } } } } } +impl Drop for LocalUpload { + fn drop(&mut self) { + match self.inner_state { + LocalUploadState::Complete => (), + _ => { + self.inner_state = LocalUploadState::Complete; + let path = staged_upload_path(&self.dest, &self.multipart_id); + // Try to cleanup intermediate file ignoring any error + match tokio::runtime::Handle::try_current() { + Ok(r) => drop(r.spawn_blocking(move || std::fs::remove_file(path))), + Err(_) => drop(std::fs::remove_file(path)), + }; + } + } + } +} + pub(crate) fn chunked_stream( mut file: File, path: PathBuf, @@ -1018,8 +1018,8 @@ fn convert_metadata(metadata: Metadata, location: Path) -> Result { /// Convert walkdir results and converts not-found errors into `None`. /// Convert broken symlinks to `None`. 
fn convert_walkdir_result( - res: std::result::Result, -) -> Result> { + res: std::result::Result, +) -> Result> { match res { Ok(entry) => { // To check for broken symlink: call symlink_metadata() - it does not traverse symlinks); @@ -1048,7 +1048,7 @@ fn convert_walkdir_result( Err(walkdir_err) => match walkdir_err.io_error() { Some(io_err) => match io_err.kind() { - io::ErrorKind::NotFound => Ok(None), + ErrorKind::NotFound => Ok(None), _ => Err(Error::UnableToWalkDir { source: walkdir_err, } @@ -1476,6 +1476,7 @@ mod not_wasm_tests { use crate::local::LocalFileSystem; use crate::{ObjectStore, Path}; use bytes::Bytes; + use std::time::Duration; use tempfile::TempDir; use tokio::io::AsyncWriteExt; @@ -1560,6 +1561,25 @@ mod not_wasm_tests { let expected_data = Bytes::from("arbitrarydatagnzarbitrarydatagnz"); assert_eq!(&*read_data, expected_data); } + + #[tokio::test] + async fn test_cleanup_intermediate_files() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + let (_, mut writer) = integration.put_multipart(&location).await.unwrap(); + writer.write_all(b"hello").await.unwrap(); + + let file_count = std::fs::read_dir(root.path()).unwrap().count(); + assert_eq!(file_count, 1); + drop(writer); + + tokio::time::sleep(Duration::from_millis(1)).await; + + let file_count = std::fs::read_dir(root.path()).unwrap().count(); + assert_eq!(file_count, 0); + } } #[cfg(target_family = "unix")] From f60ab247409de42149d28db7c9e5afee8e6e1bb5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 14 Sep 2023 10:21:50 +0100 Subject: [PATCH 186/397] ObjectStore Wasm32 Fixes (#4775) (#4776) (#4796) --- README.md | 2 +- src/client/mod.rs | 3 ++- src/lib.rs | 18 +++++++++--------- src/parse.rs | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5b47a65..fd09ec7 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ See [docs.rs](https://docs.rs/object_store) for usage instructions ## Support for `wasm32-unknown-unknown` target -It's possible to build `object_store` for the `wasm32-unknown-unknown` target, however the cloud storage features `aws`, `azure`, and `gcp` are not supported. +It's possible to build `object_store` for the `wasm32-unknown-unknown` target, however the cloud storage features `aws`, `azure`, `gcp`, and `http` are not supported. ``` cargo build -p object_store --target wasm32-unknown-unknown diff --git a/src/client/mod.rs b/src/client/mod.rs index d4995a5..77b14a7 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -18,6 +18,7 @@ //! 
Generic utilities reqwest based ObjectStore implementations pub mod backoff; + #[cfg(test)] pub mod mock_server; @@ -35,7 +36,6 @@ pub mod list; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod header; #[cfg(any(feature = "aws", feature = "gcp"))] @@ -575,6 +575,7 @@ pub struct StaticCredentialProvider { } impl StaticCredentialProvider { + /// A [`CredentialProvider`] for a static credential of type `T` pub fn new(credential: T) -> Self { Self { credential: Arc::new(credential), diff --git a/src/lib.rs b/src/lib.rs index 413b400..8d96ccf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -240,9 +240,9 @@ #[cfg(all( target_arch = "wasm32", - any(feature = "gcp", feature = "aws", feature = "azure",) + any(feature = "gcp", feature = "aws", feature = "azure", feature = "http") ))] -compile_error!("Features 'gcp', 'aws', 'azure' are not supported on wasm."); +compile_error!("Features 'gcp', 'aws', 'azure', 'http' are not supported on wasm."); #[cfg(feature = "aws")] pub mod aws; @@ -263,13 +263,16 @@ pub mod path; pub mod prefix; pub mod throttle; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] +#[cfg(feature = "cloud")] mod client; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] -pub use client::{backoff::BackoffConfig, retry::RetryConfig, CredentialProvider}; +#[cfg(feature = "cloud")] +pub use client::{ + backoff::BackoffConfig, retry::RetryConfig, ClientConfigKey, ClientOptions, + CredentialProvider, StaticCredentialProvider, +}; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] +#[cfg(feature = "cloud")] mod config; #[cfg(feature = "cloud")] @@ -295,9 +298,6 @@ use std::ops::Range; use std::sync::Arc; use tokio::io::AsyncWrite; -#[cfg(any(feature = "azure", feature = "aws", feature = "gcp", feature = "http"))] -pub use client::{ClientConfigKey, ClientOptions}; - /// An alias for a dynamically dispatched object store implementation. pub type DynObjectStore = dyn ObjectStore; diff --git a/src/parse.rs b/src/parse.rs index 7b89e58..1159e9a 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -104,7 +104,7 @@ impl ObjectStoreScheme { } } -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] macro_rules! 
builder_opts { ($builder:ty, $url:expr, $options:expr) => {{ let builder = $options.into_iter().fold( From af0a065e62d0fe628f36fb82b90602ed65957a1a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 17 Sep 2023 14:21:35 +0100 Subject: [PATCH 187/397] Update chrono pin (#4824) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index b8d4391..72722df 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ all-features = true [dependencies] # In alphabetical order async-trait = "0.1.53" bytes = "1.0" -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { version = "0.4.31", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" itertools = "0.11.0" From 33a92be171139829ac56065c5d655d9360b2fcdd Mon Sep 17 00:00:00 2001 From: Cory Grinstead Date: Tue, 19 Sep 2023 08:59:21 -0500 Subject: [PATCH 188/397] fix: object store http header last modified (#4834) * fix: object store http header last modified * refactor: make headermeta configurable on required fields * Update object_store/src/client/header.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/client/header.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/client/header.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/client/get.rs | 12 +++++---- src/client/header.rs | 62 ++++++++++++++++++++++++++++++++++---------- src/http/mod.rs | 9 +++++-- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/src/client/get.rs b/src/client/get.rs index 6b2d60a..8b84a07 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -49,8 +49,8 @@ impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.get_request(location, options, false).await?; - let meta = - header_meta(location, response.headers()).map_err(|e| Error::Generic { + let meta = header_meta(location, response.headers(), Default::default()) + .map_err(|e| Error::Generic { store: T::STORE, source: Box::new(e), })?; @@ -73,9 +73,11 @@ impl GetClientExt for T { async fn head(&self, location: &Path) -> Result { let options = GetOptions::default(); let response = self.get_request(location, options, true).await?; - header_meta(location, response.headers()).map_err(|e| Error::Generic { - store: T::STORE, - source: Box::new(e), + header_meta(location, response.headers(), Default::default()).map_err(|e| { + Error::Generic { + store: T::STORE, + source: Box::new(e), + } }) } } diff --git a/src/client/header.rs b/src/client/header.rs index cc4f16e..b55494c 100644 --- a/src/client/header.rs +++ b/src/client/header.rs @@ -19,11 +19,33 @@ use crate::path::Path; use crate::ObjectMeta; -use chrono::{DateTime, Utc}; +use chrono::{DateTime, TimeZone, Utc}; use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; use hyper::HeaderMap; use snafu::{OptionExt, ResultExt, Snafu}; +#[derive(Debug)] +/// Configuration for header extraction +pub struct HeaderConfig { + /// Whether to require an ETag header when extracting [`ObjectMeta`] from headers. 
+ /// + /// Defaults to `true` + pub etag_required: bool, + /// Whether to require a Last-Modified header when extracting [`ObjectMeta`] from headers. + /// + /// Defaults to `true` + pub last_modified_required: bool, +} + +impl Default for HeaderConfig { + fn default() -> Self { + Self { + etag_required: true, + last_modified_required: true, + } + } +} + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("ETag Header missing from response"))] @@ -52,32 +74,44 @@ pub enum Error { } /// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] -pub fn header_meta(location: &Path, headers: &HeaderMap) -> Result { - let last_modified = headers - .get(LAST_MODIFIED) - .context(MissingLastModifiedSnafu)?; +pub fn header_meta( + location: &Path, + headers: &HeaderMap, + cfg: HeaderConfig, +) -> Result { + let last_modified = match headers.get(LAST_MODIFIED) { + Some(last_modified) => { + let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; + DateTime::parse_from_rfc2822(last_modified) + .context(InvalidLastModifiedSnafu { last_modified })? + .with_timezone(&Utc) + } + None if cfg.last_modified_required => return Err(Error::MissingLastModified), + None => Utc.timestamp_nanos(0), + }; + + let e_tag = match headers.get(ETAG) { + Some(e_tag) => { + let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; + Some(e_tag.to_string()) + } + None if cfg.etag_required => return Err(Error::MissingEtag), + None => None, + }; let content_length = headers .get(CONTENT_LENGTH) .context(MissingContentLengthSnafu)?; - let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; - let last_modified = DateTime::parse_from_rfc2822(last_modified) - .context(InvalidLastModifiedSnafu { last_modified })? - .with_timezone(&Utc); - let content_length = content_length.to_str().context(BadHeaderSnafu)?; let content_length = content_length .parse() .context(InvalidContentLengthSnafu { content_length })?; - let e_tag = headers.get(ETAG).context(MissingEtagSnafu)?; - let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; - Ok(ObjectMeta { location: location.clone(), last_modified, size: content_length, - e_tag: Some(e_tag.to_string()), + e_tag, }) } diff --git a/src/http/mod.rs b/src/http/mod.rs index e8e7b45..6143819 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -40,7 +40,7 @@ use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; -use crate::client::header::header_meta; +use crate::client::header::{header_meta, HeaderConfig}; use crate::http::client::Client; use crate::path::Path; use crate::{ @@ -117,7 +117,12 @@ impl ObjectStore for HttpStore { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.client.get(location, options).await?; - let meta = header_meta(location, response.headers()).context(MetadataSnafu)?; + let cfg = HeaderConfig { + last_modified_required: false, + etag_required: false, + }; + let meta = + header_meta(location, response.headers(), cfg).context(MetadataSnafu)?; let stream = response .bytes_stream() From fb32443e9d31660c1f3fedc2483bdf591af6e425 Mon Sep 17 00:00:00 2001 From: Cory Grinstead Date: Wed, 20 Sep 2023 09:22:44 -0500 Subject: [PATCH 189/397] Error if Remote Ignores HTTP Range Header (#4841) * fix: abort http:get on !206 when issuing a range request * add some comments * pr feedback * Update object_store/src/http/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> --- src/http/client.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/http/client.rs b/src/http/client.rs index 93cd4ee..67a4129 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -37,6 +37,9 @@ enum Error { #[snafu(display("Request error: {}", source))] Reqwest { source: reqwest::Error }, + #[snafu(display("Range request not supported by {}", href))] + RangeNotSupported { href: String }, + #[snafu(display("Error decoding PROPFIND response: {}", source))] InvalidPropFind { source: quick_xml::de::DeError }, @@ -238,8 +241,9 @@ impl Client { pub async fn get(&self, location: &Path, options: GetOptions) -> Result { let url = self.path_url(location); let builder = self.client.get(url); + let has_range = options.range.is_some(); - builder + let res = builder .with_get_options(options) .send_retry(&self.retry_config) .await @@ -252,7 +256,19 @@ impl Client { } } _ => Error::Request { source }.into(), - }) + })?; + + // We expect a 206 Partial Content response if a range was requested + // a 200 OK response would indicate the server did not fulfill the request + if has_range && res.status() != StatusCode::PARTIAL_CONTENT { + return Err(crate::Error::NotSupported { + source: Box::new(Error::RangeNotSupported { + href: location.to_string(), + }), + }); + } + + Ok(res) } pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { From dca64138bfcd79e29590dbf161b31d3631ff15c7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 23 Sep 2023 18:18:45 +0100 Subject: [PATCH 190/397] Perform HEAD request for HttpStore::head (#4837) * Perform HEAD request for HttpStore::head * Logical merge conflicts * Review feedback --- src/client/get.rs | 20 +++++++--- src/client/header.rs | 11 +----- src/client/mod.rs | 1 - src/http/client.rs | 90 ++++++++++++++++++++++++++++---------------- src/http/mod.rs | 47 +++-------------------- 5 files changed, 78 insertions(+), 91 deletions(-) diff --git a/src/client/get.rs b/src/client/get.rs index 8b84a07..333f6fe 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::client::header::header_meta; +use crate::client::header::{header_meta, HeaderConfig}; use crate::path::Path; use crate::{Error, GetOptions, GetResult, ObjectMeta}; use crate::{GetResultPayload, Result}; @@ -28,6 +28,12 @@ use reqwest::Response; pub trait GetClient: Send + Sync + 'static { const STORE: &'static str; + /// Configure the [`HeaderConfig`] for this client + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: true, + last_modified_required: true, + }; + async fn get_request( &self, path: &Path, @@ -49,10 +55,12 @@ impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.get_request(location, options, false).await?; - let meta = header_meta(location, response.headers(), Default::default()) - .map_err(|e| Error::Generic { - store: T::STORE, - source: Box::new(e), + let meta = + header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { + Error::Generic { + store: T::STORE, + source: Box::new(e), + } })?; let stream = response @@ -73,7 +81,7 @@ impl GetClientExt for T { async fn head(&self, location: &Path) -> Result { let options = GetOptions::default(); let response = self.get_request(location, options, true).await?; - header_meta(location, response.headers(), Default::default()).map_err(|e| { + header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { Error::Generic { store: T::STORE, source: Box::new(e), diff --git a/src/client/header.rs b/src/client/header.rs index b55494c..6499eff 100644 --- a/src/client/header.rs +++ b/src/client/header.rs @@ -24,7 +24,7 @@ use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; use hyper::HeaderMap; use snafu::{OptionExt, ResultExt, Snafu}; -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] /// Configuration for header extraction pub struct HeaderConfig { /// Whether to require an ETag header when extracting [`ObjectMeta`] from headers. @@ -37,15 +37,6 @@ pub struct HeaderConfig { pub last_modified_required: bool, } -impl Default for HeaderConfig { - fn default() -> Self { - Self { - etag_required: true, - last_modified_required: true, - } - } -} - #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("ETag Header missing from response"))] diff --git a/src/client/mod.rs b/src/client/mod.rs index 77b14a7..ee9d62a 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -27,7 +27,6 @@ pub mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod pagination; -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod get; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] diff --git a/src/http/client.rs b/src/http/client.rs index 67a4129..0bd2e56 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. 
+use crate::client::get::GetClient; +use crate::client::header::HeaderConfig; use crate::client::retry::{self, RetryConfig, RetryExt}; use crate::client::GetOptionsExt; use crate::path::{Path, DELIMITER}; use crate::util::deserialize_rfc1123; use crate::{ClientOptions, GetOptions, ObjectMeta, Result}; +use async_trait::async_trait; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::percent_decode_str; @@ -238,39 +241,6 @@ impl Client { Ok(()) } - pub async fn get(&self, location: &Path, options: GetOptions) -> Result { - let url = self.path_url(location); - let builder = self.client.get(url); - let has_range = options.range.is_some(); - - let res = builder - .with_get_options(options) - .send_retry(&self.retry_config) - .await - .map_err(|source| match source.status() { - // Some stores return METHOD_NOT_ALLOWED for get on directories - Some(StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED) => { - crate::Error::NotFound { - source: Box::new(source), - path: location.to_string(), - } - } - _ => Error::Request { source }.into(), - })?; - - // We expect a 206 Partial Content response if a range was requested - // a 200 OK response would indicate the server did not fulfill the request - if has_range && res.status() != StatusCode::PARTIAL_CONTENT { - return Err(crate::Error::NotSupported { - source: Box::new(Error::RangeNotSupported { - href: location.to_string(), - }), - }); - } - - Ok(res) - } - pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let mut retry = false; loop { @@ -307,6 +277,60 @@ impl Client { } } +#[async_trait] +impl GetClient for Client { + const STORE: &'static str = "HTTP"; + + /// Override the [`HeaderConfig`] to be less strict to support a + /// broader range of HTTP servers (#4831) + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: false, + last_modified_required: false, + }; + + async fn get_request( + &self, + location: &Path, + options: GetOptions, + head: bool, + ) -> Result { + let url = self.path_url(location); + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + let has_range = options.range.is_some(); + let builder = self.client.request(method, url); + + let res = builder + .with_get_options(options) + .send_retry(&self.retry_config) + .await + .map_err(|source| match source.status() { + // Some stores return METHOD_NOT_ALLOWED for get on directories + Some(StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED) => { + crate::Error::NotFound { + source: Box::new(source), + path: location.to_string(), + } + } + _ => Error::Request { source }.into(), + })?; + + // We expect a 206 Partial Content response if a range was requested + // a 200 OK response would indicate the server did not fulfill the request + if has_range && res.status() != StatusCode::PARTIAL_CONTENT { + return Err(crate::Error::NotSupported { + source: Box::new(Error::RangeNotSupported { + href: location.to_string(), + }), + }); + } + + Ok(res) + } +} + /// The response returned by a PROPFIND request, i.e. 
list #[derive(Deserialize, Default)] pub struct MultiStatus { diff --git a/src/http/mod.rs b/src/http/mod.rs index 6143819..afbc0ce 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -34,18 +34,18 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; -use futures::{StreamExt, TryStreamExt}; +use futures::StreamExt; use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; -use crate::client::header::{header_meta, HeaderConfig}; +use crate::client::get::GetClientExt; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientConfigKey, ClientOptions, GetOptions, GetResult, GetResultPayload, ListResult, - MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, + ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, + ObjectMeta, ObjectStore, Result, RetryConfig, }; mod client; @@ -115,46 +115,11 @@ impl ObjectStore for HttpStore { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let range = options.range.clone(); - let response = self.client.get(location, options).await?; - let cfg = HeaderConfig { - last_modified_required: false, - etag_required: false, - }; - let meta = - header_meta(location, response.headers(), cfg).context(MetadataSnafu)?; - - let stream = response - .bytes_stream() - .map_err(|source| Error::Reqwest { source }.into()) - .boxed(); - - Ok(GetResult { - payload: GetResultPayload::Stream(stream), - range: range.unwrap_or(0..meta.size), - meta, - }) + self.client.get_opts(location, options).await } async fn head(&self, location: &Path) -> Result { - let status = self.client.list(Some(location), "0").await?; - match status.response.len() { - 1 => { - let response = status.response.into_iter().next().unwrap(); - response.check_ok()?; - match response.is_dir() { - true => Err(crate::Error::NotFound { - path: location.to_string(), - source: "Is directory".to_string().into(), - }), - false => response.object_meta(self.client.base_url()), - } - } - x => Err(crate::Error::NotFound { - path: location.to_string(), - source: format!("Expected 1 result, got {x}").into(), - }), - } + self.client.head(location).await } async fn delete(&self, location: &Path) -> Result<()> { From 06d4f1d8b5baf49bfacf6c70480b28fe1c8f2bfd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 23 Sep 2023 13:18:56 -0400 Subject: [PATCH 191/397] Minor: Improve object_store docs.rs landing page (#4849) * Improve object_store docs.rs landing page * Apply suggestions from code review --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/http/mod.rs | 2 +- src/lib.rs | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/http/mod.rs b/src/http/mod.rs index afbc0ce..e9ed590 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -17,7 +17,7 @@ //! An object store implementation for generic HTTP servers //! -//! This follows [rfc2518] commonly known called [WebDAV] +//! This follows [rfc2518] commonly known as [WebDAV] //! //! Basic get support will work out of the box with most HTTP servers, //! even those that don't explicitly support [rfc2518] diff --git a/src/lib.rs b/src/lib.rs index 8d96ccf..cef10f1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,7 +36,7 @@ //! clouds and local test environments, via a simple runtime //! configuration change. //! -//! # Features: +//! # Highlights //! //! 1. A focused, easy to use, idiomatic, well documented, high //! 
performance, `async` API. @@ -53,26 +53,31 @@ //! [InfluxDB IOx]: https://github.com/influxdata/influxdb_iox/ //! [crates.io]: https://github.com/rust-lang/crates.io //! -//! # Example: Create an [`ObjectStore`] implementation: +//! # Available [`ObjectStore`] Implementations +//! +//! By default, this crate provides the following implementations: +//! +//! * Memory: [`InMemory`](memory::InMemory) +//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) +//! +//! Feature flags are used to enable support for other implementations: //! #![cfg_attr( feature = "gcp", - doc = "* [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" + doc = "* `gcp`: [Google Cloud Storage](https://cloud.google.com/storage/) support. See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" )] #![cfg_attr( feature = "aws", - doc = "* [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder)" + doc = "* `aws`: [Amazon S3](https://aws.amazon.com/s3/). See [`AmazonS3Builder`](aws::AmazonS3Builder)" )] #![cfg_attr( feature = "azure", - doc = "* [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" + doc = "* `azure`: [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/). See [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" )] #![cfg_attr( feature = "http", - doc = "* [HTTP Storage](https://datatracker.ietf.org/doc/html/rfc2518): [`HttpBuilder`](http::HttpBuilder)" + doc = "* `http`: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] -//! * In Memory: [`InMemory`](memory::InMemory) -//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! //! # Adapters //! 
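The revised `lib.rs` documentation above lists the implementations that are always available (`InMemory`, `LocalFileSystem`) and those gated behind the `gcp`, `aws`, `azure`, and `http` feature flags. A minimal sketch of the default, feature-free usage follows; it is not part of the patch series, and it assumes the 0.7.x API surface shown elsewhere in these patches (`put` taking `Bytes`, `GetResult::bytes`, `LocalFileSystem::new_with_prefix`) plus a tokio runtime:

```rust
// Sketch only: illustrates the two always-available implementations named in
// the docs above. Assumes object_store 0.7.x and tokio with the "macros" and
// "rt-multi-thread" features enabled.
use bytes::Bytes;
use object_store::{local::LocalFileSystem, memory::InMemory, path::Path, ObjectStore};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    // In-memory store, handy for tests.
    let memory = InMemory::new();
    let location = Path::from("example/data.txt");
    memory.put(&location, Bytes::from("hello world")).await?;
    let bytes = memory.get(&location).await?.bytes().await?;
    assert_eq!(bytes.as_ref(), b"hello world");

    // Local filesystem store rooted at a prefix, also enabled by default.
    let _local = LocalFileSystem::new_with_prefix(std::env::temp_dir())?;

    Ok(())
}
```

The same `ObjectStore` trait calls apply unchanged to the feature-gated cloud builders, which is the portability point the docs.rs landing page patch is making.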
From 65f1e9158ea552053c1184da7033bbb4333a96a2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 25 Sep 2023 10:45:16 +0100 Subject: [PATCH 192/397] Allow overriding azure endpoint (#4853) (#4854) --- src/azure/mod.rs | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 2a07710..b210d48 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -325,6 +325,8 @@ pub struct MicrosoftAzureBuilder { url: Option, /// When set to true, azurite storage emulator has to be used use_emulator: ConfigValue, + /// Storage endpoint + endpoint: Option, /// Msi endpoint for acquiring managed identity token msi_endpoint: Option, /// Object id for use with managed identity authentication @@ -434,6 +436,14 @@ pub enum AzureConfigKey { /// - `use_emulator` UseEmulator, + /// Override the endpoint used to communicate with blob storage + /// + /// Supported keys: + /// - `azure_storage_endpoint` + /// - `azure_endpoint` + /// - `endpoint` + Endpoint, + /// Use object store with url scheme account.dfs.fabric.microsoft.com /// /// Supported keys: @@ -501,6 +511,7 @@ impl AsRef for AzureConfigKey { Self::Token => "azure_storage_token", Self::UseEmulator => "azure_storage_use_emulator", Self::UseFabricEndpoint => "azure_use_fabric_endpoint", + Self::Endpoint => "azure_storage_endpoint", Self::MsiEndpoint => "azure_msi_endpoint", Self::ObjectId => "azure_object_id", Self::MsiResourceId => "azure_msi_resource_id", @@ -542,6 +553,9 @@ impl FromStr for AzureConfigKey { | "sas_token" => Ok(Self::SasKey), "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), + "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => { + Ok(Self::Endpoint) + } "azure_msi_endpoint" | "azure_identity_endpoint" | "identity_endpoint" @@ -668,6 +682,7 @@ impl MicrosoftAzureBuilder { } AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), AzureConfigKey::UseEmulator => self.use_emulator.parse(value), + AzureConfigKey::Endpoint => self.endpoint = Some(value.into()), AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), AzureConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) @@ -726,6 +741,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::UseFabricEndpoint => { Some(self.use_fabric_endpoint.to_string()) } + AzureConfigKey::Endpoint => self.endpoint.clone(), AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), AzureConfigKey::ObjectId => self.object_id.clone(), AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), @@ -873,9 +889,19 @@ impl MicrosoftAzureBuilder { self } + /// Override the endpoint used to communicate with blob storage + /// + /// Defaults to `https://{account}.blob.core.windows.net` + pub fn with_endpoint(mut self, endpoint: String) -> Self { + self.endpoint = Some(endpoint); + self + } + /// Set if Microsoft Fabric url scheme should be used (defaults to false) /// When disabled the url scheme used is `https://{account}.blob.core.windows.net` /// When enabled the url scheme used is `https://{account}.dfs.fabric.microsoft.com` + /// + /// Note: [`Self::with_endpoint`] will take precedence over this option pub fn with_use_fabric_endpoint(mut self, use_fabric_endpoint: bool) -> Self { self.use_fabric_endpoint = use_fabric_endpoint.into(); self @@ -986,9 +1012,14 @@ impl MicrosoftAzureBuilder { (true, url, 
credential, account_name) } else { let account_name = self.account_name.ok_or(Error::MissingAccount {})?; - let account_url = match self.use_fabric_endpoint.get()? { - true => format!("https://{}.blob.fabric.microsoft.com", &account_name), - false => format!("https://{}.blob.core.windows.net", &account_name), + let account_url = match self.endpoint { + Some(account_url) => account_url, + None => match self.use_fabric_endpoint.get()? { + true => { + format!("https://{}.blob.fabric.microsoft.com", &account_name) + } + false => format!("https://{}.blob.core.windows.net", &account_name), + }, }; let url = Url::parse(&account_url) From 690bf26aed40affbb857f721fa58658eb174c2fe Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 25 Sep 2023 18:24:15 +0100 Subject: [PATCH 193/397] Add ObjectStore BufReader (#4762) (#4857) * Add ObjectStore BufReader (#4762) * Clippy * More Clippy * Fix MSRV * Fix doc --- src/buffered.rs | 293 ++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 2 files changed, 294 insertions(+) create mode 100644 src/buffered.rs diff --git a/src/buffered.rs b/src/buffered.rs new file mode 100644 index 0000000..bdc3f4c --- /dev/null +++ b/src/buffered.rs @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities for performing tokio-style buffered IO + +use crate::path::Path; +use crate::{ObjectMeta, ObjectStore}; +use bytes::Bytes; +use futures::future::{BoxFuture, FutureExt}; +use futures::ready; +use std::cmp::Ordering; +use std::io::{Error, ErrorKind, SeekFrom}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tokio::io::{AsyncBufRead, AsyncRead, AsyncSeek, ReadBuf}; + +/// The default buffer size used by [`BufReader`] +pub const DEFAULT_BUFFER_SIZE: usize = 1024 * 1024; + +/// An async-buffered reader compatible with the tokio IO traits +/// +/// Internally this maintains a buffer of the requested size, and uses [`ObjectStore::get_range`] +/// to populate its internal buffer once depleted. This buffer is cleared on seek. +/// +/// Whilst simple, this interface will typically be outperformed by the native [`ObjectStore`] +/// methods that better map to the network APIs. This is because most object stores have +/// very [high first-byte latencies], on the order of 100-200ms, and so avoiding unnecessary +/// round-trips is critical to throughput. +/// +/// Systems looking to sequentially scan a file should instead consider using [`ObjectStore::get`], +/// or [`ObjectStore::get_opts`], or [`ObjectStore::get_range`] to read a particular range. +/// +/// Systems looking to read multiple ranges of a file should instead consider using +/// [`ObjectStore::get_ranges`], which will optimise the vectored IO. 
+/// +/// [high first-byte latencies]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html +pub struct BufReader { + /// The object store to fetch data from + store: Arc, + /// The size of the object + size: u64, + /// The path to the object + path: Path, + /// The current position in the object + cursor: u64, + /// The number of bytes to read in a single request + capacity: usize, + /// The buffered data if any + buffer: Buffer, +} + +impl std::fmt::Debug for BufReader { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BufReader") + .field("path", &self.path) + .field("size", &self.size) + .field("capacity", &self.capacity) + .finish() + } +} + +enum Buffer { + Empty, + Pending(BoxFuture<'static, std::io::Result>), + Ready(Bytes), +} + +impl BufReader { + /// Create a new [`BufReader`] from the provided [`ObjectMeta`] and [`ObjectStore`] + pub fn new(store: Arc, meta: &ObjectMeta) -> Self { + Self::with_capacity(store, meta, DEFAULT_BUFFER_SIZE) + } + + /// Create a new [`BufReader`] from the provided [`ObjectMeta`], [`ObjectStore`], and `capacity` + pub fn with_capacity( + store: Arc, + meta: &ObjectMeta, + capacity: usize, + ) -> Self { + Self { + path: meta.location.clone(), + size: meta.size as _, + store, + capacity, + cursor: 0, + buffer: Buffer::Empty, + } + } + + fn poll_fill_buf_impl( + &mut self, + cx: &mut Context<'_>, + amnt: usize, + ) -> Poll> { + let buf = &mut self.buffer; + loop { + match buf { + Buffer::Empty => { + let store = Arc::clone(&self.store); + let path = self.path.clone(); + let start = self.cursor.min(self.size) as _; + let end = self.cursor.saturating_add(amnt as u64).min(self.size) as _; + + if start == end { + return Poll::Ready(Ok(&[])); + } + + *buf = Buffer::Pending(Box::pin(async move { + Ok(store.get_range(&path, start..end).await?) + })) + } + Buffer::Pending(fut) => match ready!(fut.poll_unpin(cx)) { + Ok(b) => *buf = Buffer::Ready(b), + Err(e) => return Poll::Ready(Err(e)), + }, + Buffer::Ready(r) => return Poll::Ready(Ok(r)), + } + } + } +} + +impl AsyncSeek for BufReader { + fn start_seek(mut self: Pin<&mut Self>, position: SeekFrom) -> std::io::Result<()> { + self.cursor = match position { + SeekFrom::Start(offset) => offset, + SeekFrom::End(offset) => { + checked_add_signed(self.size,offset).ok_or_else(|| Error::new(ErrorKind::InvalidInput, format!("Seeking {offset} from end of {} byte file would result in overflow", self.size)))? + } + SeekFrom::Current(offset) => { + checked_add_signed(self.cursor, offset).ok_or_else(|| Error::new(ErrorKind::InvalidInput, format!("Seeking {offset} from current offset of {} would result in overflow", self.cursor)))? 
+ } + }; + self.buffer = Buffer::Empty; + Ok(()) + } + + fn poll_complete( + self: Pin<&mut Self>, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(self.cursor)) + } +} + +impl AsyncRead for BufReader { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + out: &mut ReadBuf<'_>, + ) -> Poll> { + // Read the maximum of the internal buffer and `out` + let to_read = out.remaining().max(self.capacity); + let r = match ready!(self.poll_fill_buf_impl(cx, to_read)) { + Ok(buf) => { + let to_consume = out.remaining().min(buf.len()); + out.put_slice(&buf[..to_consume]); + self.consume(to_consume); + Ok(()) + } + Err(e) => Err(e), + }; + Poll::Ready(r) + } +} + +impl AsyncBufRead for BufReader { + fn poll_fill_buf( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let capacity = self.capacity; + self.get_mut().poll_fill_buf_impl(cx, capacity) + } + + fn consume(mut self: Pin<&mut Self>, amt: usize) { + match &mut self.buffer { + Buffer::Empty => assert_eq!(amt, 0, "cannot consume from empty buffer"), + Buffer::Ready(b) => match b.len().cmp(&amt) { + Ordering::Less => panic!("{amt} exceeds buffer sized of {}", b.len()), + Ordering::Greater => *b = b.slice(amt..), + Ordering::Equal => self.buffer = Buffer::Empty, + }, + Buffer::Pending(_) => panic!("cannot consume from pending buffer"), + } + self.cursor += amt as u64; + } +} + +/// Port of standardised function as requires Rust 1.66 +/// +/// +#[inline] +fn checked_add_signed(a: u64, rhs: i64) -> Option { + let (res, overflowed) = a.overflowing_add(rhs as _); + let overflow = overflowed ^ (rhs < 0); + (!overflow).then_some(res) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::memory::InMemory; + use crate::path::Path; + use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncSeekExt}; + + #[tokio::test] + async fn test_buf_reader() { + let store = Arc::new(InMemory::new()) as Arc; + + let existent = Path::from("exists.txt"); + const BYTES: usize = 4096; + + let data: Bytes = b"12345678".iter().cycle().copied().take(BYTES).collect(); + store.put(&existent, data.clone()).await.unwrap(); + + let meta = store.head(&existent).await.unwrap(); + + let mut reader = BufReader::new(Arc::clone(&store), &meta); + let mut out = Vec::with_capacity(BYTES); + let read = reader.read_to_end(&mut out).await.unwrap(); + + assert_eq!(read, BYTES); + assert_eq!(&out, &data); + + let err = reader.seek(SeekFrom::Current(i64::MIN)).await.unwrap_err(); + assert_eq!(err.to_string(), "Seeking -9223372036854775808 from current offset of 4096 would result in overflow"); + + reader.rewind().await.unwrap(); + + let err = reader.seek(SeekFrom::Current(-1)).await.unwrap_err(); + assert_eq!( + err.to_string(), + "Seeking -1 from current offset of 0 would result in overflow" + ); + + // Seeking beyond the bounds of the file is permitted but should return no data + reader.seek(SeekFrom::Start(u64::MAX)).await.unwrap(); + let buf = reader.fill_buf().await.unwrap(); + assert!(buf.is_empty()); + + let err = reader.seek(SeekFrom::Current(1)).await.unwrap_err(); + assert_eq!(err.to_string(), "Seeking 1 from current offset of 18446744073709551615 would result in overflow"); + + for capacity in [200, 1024, 4096, DEFAULT_BUFFER_SIZE] { + let store = Arc::clone(&store); + let mut reader = BufReader::with_capacity(store, &meta, capacity); + + let mut bytes_read = 0; + loop { + let buf = reader.fill_buf().await.unwrap(); + if buf.is_empty() { + assert_eq!(bytes_read, BYTES); + break; + } + assert!(buf.starts_with(b"12345678")); + bytes_read += 8; + 
reader.consume(8); + } + + let mut buf = Vec::with_capacity(76); + reader.seek(SeekFrom::Current(-76)).await.unwrap(); + reader.read_to_end(&mut buf).await.unwrap(); + assert_eq!(&buf, &data[BYTES - 76..]); + + reader.rewind().await.unwrap(); + let buffer = reader.fill_buf().await.unwrap(); + assert_eq!(buffer, &data[..capacity.min(BYTES)]); + + reader.seek(SeekFrom::Start(325)).await.unwrap(); + let buffer = reader.fill_buf().await.unwrap(); + assert_eq!(buffer, &data[325..(325 + capacity).min(BYTES)]); + + reader.seek(SeekFrom::End(0)).await.unwrap(); + let buffer = reader.fill_buf().await.unwrap(); + assert!(buffer.is_empty()); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index cef10f1..3fd363f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -253,6 +253,7 @@ compile_error!("Features 'gcp', 'aws', 'azure', 'http' are not supported on wasm pub mod aws; #[cfg(feature = "azure")] pub mod azure; +pub mod buffered; #[cfg(not(target_arch = "wasm32"))] pub mod chunked; pub mod delimited; From 549ada8f6bf4ad533b63ad3a28cb8b7c3dcbe680 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 26 Sep 2023 16:56:27 +0100 Subject: [PATCH 194/397] Prepare object_store 0.7.1 (#4860) --- CHANGELOG-old.md | 47 +++++++++++++++++++++++ CHANGELOG.md | 64 ++++++++++++++++---------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 4 files changed, 81 insertions(+), 36 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 3880205..a0ced7c 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,53 @@ # Historical Changelog +## [object_store_0.7.0](https://github.com/apache/arrow-rs/tree/object_store_0.7.0) (2023-08-15) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.1...object_store_0.7.0) + +**Breaking changes:** + +- Add range and ObjectMeta to GetResult \(\#4352\) \(\#4495\) [\#4677](https://github.com/apache/arrow-rs/pull/4677) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Add AzureConfigKey::ContainerName [\#4629](https://github.com/apache/arrow-rs/issues/4629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) +- Make object\_store::multipart public [\#4569](https://github.com/apache/arrow-rs/issues/4569) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Export `ClientConfigKey` and make the `HttpBuilder` more consistent with other builders [\#4515](https://github.com/apache/arrow-rs/issues/4515) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store/InMemory: Make `clone()` non-async [\#4496](https://github.com/apache/arrow-rs/issues/4496) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add Range to GetResult::File [\#4352](https://github.com/apache/arrow-rs/issues/4352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support copy\_if\_not\_exists for Cloudflare R2 \(S3 API\) [\#4190](https://github.com/apache/arrow-rs/issues/4190) + +**Fixed bugs:** + +- object\_store documentation is broken [\#4683](https://github.com/apache/arrow-rs/issues/4683) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Exports are not sufficient for configuring some object stores, for example minio running locally 
[\#4530](https://github.com/apache/arrow-rs/issues/4530) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Uploading empty file to S3 results in "411 Length Required" [\#4514](https://github.com/apache/arrow-rs/issues/4514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- GCP doesn't fetch public objects [\#4417](https://github.com/apache/arrow-rs/issues/4417) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Closed issues:** + +- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) +- AWS Profile credentials no longer working in object\_store 0.6.1 [\#4556](https://github.com/apache/arrow-rs/issues/4556) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Add AzureConfigKey::ContainerName \(\#4629\) [\#4686](https://github.com/apache/arrow-rs/pull/4686) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) +- Use Config System for Object Store Integration Tests [\#4628](https://github.com/apache/arrow-rs/pull/4628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Prepare arrow 45 [\#4590](https://github.com/apache/arrow-rs/pull/4590) ([tustvold](https://github.com/tustvold)) +- Add Support for Microsoft Fabric / OneLake [\#4573](https://github.com/apache/arrow-rs/pull/4573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vmuddassir-msft](https://github.com/vmuddassir-msft)) +- Cleanup multipart upload trait [\#4572](https://github.com/apache/arrow-rs/pull/4572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make object\_store::multipart public [\#4570](https://github.com/apache/arrow-rs/pull/4570) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([yjshen](https://github.com/yjshen)) +- Handle empty S3 payloads \(\#4514\) [\#4518](https://github.com/apache/arrow-rs/pull/4518) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: Export `ClientConfigKey` and add `HttpBuilder::with_config` [\#4516](https://github.com/apache/arrow-rs/pull/4516) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thehabbos007](https://github.com/thehabbos007)) +- object\_store: Implement `ObjectStore` for `Arc` [\#4502](https://github.com/apache/arrow-rs/pull/4502) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- object\_store/InMemory: Add `fork()` fn and deprecate `clone()` fn [\#4499](https://github.com/apache/arrow-rs/pull/4499) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) +- gcp: Exclude authorization header when bearer empty [\#4418](https://github.com/apache/arrow-rs/pull/4418) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vrongmeal](https://github.com/vrongmeal)) +- Support 
copy\_if\_not\_exists for Cloudflare R2 \(\#4190\) [\#4239](https://github.com/apache/arrow-rs/pull/4239) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + ## [object_store_0.6.0](https://github.com/apache/arrow-rs/tree/object_store_0.6.0) (2023-05-18) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.6...object_store_0.6.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1250639..1f069ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,51 +19,49 @@ # Changelog -## [object_store_0.7.0](https://github.com/apache/arrow-rs/tree/object_store_0.7.0) (2023-08-15) +## [object_store_0.7.1](https://github.com/apache/arrow-rs/tree/object_store_0.7.1) (2023-09-26) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.1...object_store_0.7.0) - -**Breaking changes:** - -- Add range and ObjectMeta to GetResult \(\#4352\) \(\#4495\) [\#4677](https://github.com/apache/arrow-rs/pull/4677) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.0...object_store_0.7.1) **Implemented enhancements:** -- Add AzureConfigKey::ContainerName [\#4629](https://github.com/apache/arrow-rs/issues/4629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) -- Make object\_store::multipart public [\#4569](https://github.com/apache/arrow-rs/issues/4569) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Export `ClientConfigKey` and make the `HttpBuilder` more consistent with other builders [\#4515](https://github.com/apache/arrow-rs/issues/4515) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store/InMemory: Make `clone()` non-async [\#4496](https://github.com/apache/arrow-rs/issues/4496) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add Range to GetResult::File [\#4352](https://github.com/apache/arrow-rs/issues/4352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support copy\_if\_not\_exists for Cloudflare R2 \(S3 API\) [\#4190](https://github.com/apache/arrow-rs/issues/4190) +- Automatically Cleanup LocalFileSystem Temporary Files [\#4778](https://github.com/apache/arrow-rs/issues/4778) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: Expose an async reader API for object store [\#4762](https://github.com/apache/arrow-rs/issues/4762) +- Improve proxy support by using reqwest::Proxy as configuration [\#4713](https://github.com/apache/arrow-rs/issues/4713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- object\_store documentation is broken [\#4683](https://github.com/apache/arrow-rs/issues/4683) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Exports are not sufficient for configuring some object stores, for example minio running locally [\#4530](https://github.com/apache/arrow-rs/issues/4530) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Uploading empty file to S3 results in "411 Length Required" [\#4514](https://github.com/apache/arrow-rs/issues/4514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- GCP doesn't fetch public objects 
[\#4417](https://github.com/apache/arrow-rs/issues/4417) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: http shouldn't perform range requests unless `accept-ranges: bytes` header is present [\#4839](https://github.com/apache/arrow-rs/issues/4839) +- object-store: http-store fails when url doesn't have last-modified header on 0.7.0 [\#4831](https://github.com/apache/arrow-rs/issues/4831) +- object-store fails to compile for `wasm32-unknown-unknown` with `http` feature [\#4776](https://github.com/apache/arrow-rs/issues/4776) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: could not find `header` in `client` for `http` feature [\#4775](https://github.com/apache/arrow-rs/issues/4775) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem Copy and Rename Don't Create Intermediate Directories [\#4760](https://github.com/apache/arrow-rs/issues/4760) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem Copy is not Atomic [\#4758](https://github.com/apache/arrow-rs/issues/4758) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Closed issues:** -- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) -- AWS Profile credentials no longer working in object\_store 0.6.1 [\#4556](https://github.com/apache/arrow-rs/issues/4556) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store Azure Government Cloud functionality? [\#4853](https://github.com/apache/arrow-rs/issues/4853) **Merged pull requests:** -- Add AzureConfigKey::ContainerName \(\#4629\) [\#4686](https://github.com/apache/arrow-rs/pull/4686) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) -- Use Config System for Object Store Integration Tests [\#4628](https://github.com/apache/arrow-rs/pull/4628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Prepare arrow 45 [\#4590](https://github.com/apache/arrow-rs/pull/4590) ([tustvold](https://github.com/tustvold)) -- Add Support for Microsoft Fabric / OneLake [\#4573](https://github.com/apache/arrow-rs/pull/4573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vmuddassir-msft](https://github.com/vmuddassir-msft)) -- Cleanup multipart upload trait [\#4572](https://github.com/apache/arrow-rs/pull/4572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Make object\_store::multipart public [\#4570](https://github.com/apache/arrow-rs/pull/4570) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([yjshen](https://github.com/yjshen)) -- Handle empty S3 payloads \(\#4514\) [\#4518](https://github.com/apache/arrow-rs/pull/4518) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- object\_store: Export `ClientConfigKey` and add `HttpBuilder::with_config` [\#4516](https://github.com/apache/arrow-rs/pull/4516) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thehabbos007](https://github.com/thehabbos007)) -- object\_store: Implement 
`ObjectStore` for `Arc` [\#4502](https://github.com/apache/arrow-rs/pull/4502) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) -- object\_store/InMemory: Add `fork()` fn and deprecate `clone()` fn [\#4499](https://github.com/apache/arrow-rs/pull/4499) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) -- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) -- gcp: Exclude authorization header when bearer empty [\#4418](https://github.com/apache/arrow-rs/pull/4418) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vrongmeal](https://github.com/vrongmeal)) -- Support copy\_if\_not\_exists for Cloudflare R2 \(\#4190\) [\#4239](https://github.com/apache/arrow-rs/pull/4239) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore BufReader \(\#4762\) [\#4857](https://github.com/apache/arrow-rs/pull/4857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Allow overriding azure endpoint [\#4854](https://github.com/apache/arrow-rs/pull/4854) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve object\_store docs.rs landing page [\#4849](https://github.com/apache/arrow-rs/pull/4849) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Error if Remote Ignores HTTP Range Header [\#4841](https://github.com/apache/arrow-rs/pull/4841) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) +- Perform HEAD request for HttpStore::head [\#4837](https://github.com/apache/arrow-rs/pull/4837) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix: object store http header last modified [\#4834](https://github.com/apache/arrow-rs/pull/4834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) +- Prepare arrow 47.0.0 [\#4827](https://github.com/apache/arrow-rs/pull/4827) ([tustvold](https://github.com/tustvold)) +- ObjectStore Wasm32 Fixes \(\#4775\) \(\#4776\) [\#4796](https://github.com/apache/arrow-rs/pull/4796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Best effort cleanup of staged upload files \(\#4778\) [\#4792](https://github.com/apache/arrow-rs/pull/4792) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relaxing type bounds on coalesce\_ranges and collect\_bytes [\#4787](https://github.com/apache/arrow-rs/pull/4787) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) +- Update object\_store chrono deprecations [\#4786](https://github.com/apache/arrow-rs/pull/4786) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make coalesce\_ranges and collect\_bytes available for crate users [\#4784](https://github.com/apache/arrow-rs/pull/4784) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) +- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Make ObjectStore::copy Atomic and Automatically Create Parent Directories \(\#4758\) \(\#4760\) [\#4759](https://github.com/apache/arrow-rs/pull/4759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update nix requirement from 0.26.1 to 0.27.1 in /object\_store [\#4744](https://github.com/apache/arrow-rs/pull/4744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Add `with_proxy_ca_certificate` and `with_proxy_excludes` [\#4714](https://github.com/apache/arrow-rs/pull/4714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gordonwang0](https://github.com/gordonwang0)) +- Update object\_store Dependencies and Configure Dependabot [\#4700](https://github.com/apache/arrow-rs/pull/4700) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + + \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index 72722df..ff8047c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.7.0" +version = "0.7.1" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 48835c7..aeec3ca 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.6.1" -FUTURE_RELEASE="object_store_0.7.0" +SINCE_TAG="object_store_0.7.0" +FUTURE_RELEASE="object_store_0.7.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 4e5d08b1656e38b077d94a6e07e811f3086997d8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 27 Sep 2023 17:52:25 +0100 Subject: [PATCH 195/397] Flush in multiple_append test (#4868) (#4869) --- src/local.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/local.rs b/src/local.rs index 20eb3c6..c625c59 100644 --- a/src/local.rs +++ b/src/local.rs @@ -1545,11 +1545,13 @@ mod not_wasm_tests { for d in &data { writer.write_all(d).await.unwrap(); } + writer.flush().await.unwrap(); let mut writer = integration.append(&location).await.unwrap(); for d in &data { writer.write_all(d).await.unwrap(); } + writer.flush().await.unwrap(); let read_data = integration .get(&location) From aeef1adbb306b1a29ec1a48cd8e39f21d90d1090 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 28 Sep 2023 17:46:49 +0100 Subject: [PATCH 196/397] Flush in creates_dir_if_not_present_append (#4872) (#4874) --- src/local.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/local.rs b/src/local.rs index c625c59..69da170 100644 --- a/src/local.rs +++ b/src/local.rs @@ -1494,6 +1494,8 @@ mod not_wasm_tests { writer.write_all(data.as_ref()).await.unwrap(); + writer.flush().await.unwrap(); + let read_data = integration .get(&location) .await From cf99b75f83b25cc988fe636d2941b0fb88a1143c Mon Sep 17 00:00:00 2001 From: 
"dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Oct 2023 15:14:34 +0100 Subject: [PATCH 197/397] Update ring requirement from 0.16 to 0.17 in /object_store (#4887) * Update ring requirement from 0.16 to 0.17 in /object_store Updates the requirements on [ring](https://github.com/briansmith/ring) to permit the latest version. - [Commits](https://github.com/briansmith/ring/commits) --- updated-dependencies: - dependency-name: ring dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Clippy --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- Cargo.toml | 2 +- src/gcp/credential.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ff8047c..7928648 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,7 +51,7 @@ serde = { version = "1.0", default-features = false, features = ["derive"], opti serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } -ring = { version = "0.16", default-features = false, features = ["std"], optional = true } +ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 205b805..ad21c33 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -203,7 +203,7 @@ impl TokenProvider for OAuthProvider { let claim_str = b64_encode_obj(&claims)?; let message = [self.jwt_header.as_ref(), claim_str.as_ref()].join("."); - let mut sig_bytes = vec![0; self.key_pair.public_modulus_len()]; + let mut sig_bytes = vec![0; self.key_pair.public().modulus_len()]; self.key_pair .sign( &ring::signature::RSA_PKCS1_SHA256, From a48c746f361c799c81fde14f92fc4ff60c2e8332 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Fri, 6 Oct 2023 09:55:38 -0400 Subject: [PATCH 198/397] Upgrade to Rust 1.73.0 (#4899) * fix: Call Ord's implementation from PartialOrd so they stay in sync As recommended by Clippy in Rust 1.73.0 * fix: Use or_default methods instead of or_else(default) As recommended by Clippy in Rust 1.73.0 * fix: Use filter then map with bools instead of filter_map then As recommended by Clippy in Rust 1.73.0 * fix: Change a match guard to a pattern As recommended by Clippy in Rust 1.73.0 * fix: Change to a different kind of filter_map Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/azure/client.rs | 2 +- src/azure/credential.rs | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index e18135c..cd1a3a1 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -372,7 +372,7 @@ struct ListResultInternal { } fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result { - let prefix = prefix.map(Path::from).unwrap_or_else(Path::default); + let prefix = prefix.map(Path::from).unwrap_or_default(); let common_prefixes = value .blobs .blob_prefix diff --git 
a/src/azure/credential.rs b/src/azure/credential.rs index fd75389..8dc6136 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -234,11 +234,9 @@ fn string_to_sign(h: &HeaderMap, u: &Url, method: &Method, account: &str) -> Str fn canonicalize_header(headers: &HeaderMap) -> String { let mut names = headers .iter() - .filter_map(|(k, _)| { - (k.as_str().starts_with("x-ms")) - // TODO remove unwraps - .then(|| (k.as_str(), headers.get(k).unwrap().to_str().unwrap())) - }) + .filter(|&(k, _)| (k.as_str().starts_with("x-ms"))) + // TODO remove unwraps + .map(|(k, _)| (k.as_str(), headers.get(k).unwrap().to_str().unwrap())) .collect::>(); names.sort_unstable(); From 2c55bf966cde279d5f56e3fce1c619f6106b1ff7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 11 Oct 2023 07:57:59 +0100 Subject: [PATCH 199/397] Cleanup `object_store::retry` client error handling (#4915) * Cleanup client error handling * Clippy * Format * Update test * Review feedback --- src/client/retry.rs | 180 +++++++++++++++++++++++--------------------- src/gcp/mod.rs | 2 +- 2 files changed, 96 insertions(+), 86 deletions(-) diff --git a/src/client/retry.rs b/src/client/retry.rs index 39a9131..e4d246c 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -23,46 +23,50 @@ use futures::FutureExt; use reqwest::header::LOCATION; use reqwest::{Response, StatusCode}; use snafu::Error as SnafuError; +use snafu::Snafu; use std::time::{Duration, Instant}; use tracing::info; /// Retry request error -#[derive(Debug)] -pub struct Error { - retries: usize, - message: String, - source: Option, - status: Option, -} - -impl std::fmt::Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "response error \"{}\", after {} retries", - self.message, self.retries - )?; - if let Some(source) = &self.source { - write!(f, ": {source}")?; - } - Ok(()) - } -} - -impl std::error::Error for Error { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - self.source.as_ref().map(|e| e as _) - } +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Received redirect without LOCATION, this normally indicates an incorrectly configured region"))] + BareRedirect, + + #[snafu(display("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body")))] + Client { + status: StatusCode, + body: Option, + }, + + #[snafu(display("Error after {retries} retries: {source}"))] + Reqwest { + retries: usize, + source: reqwest::Error, + }, } impl Error { /// Returns the status code associated with this error if any pub fn status(&self) -> Option { - self.status + match self { + Self::BareRedirect => None, + Self::Client { status, .. } => Some(*status), + Self::Reqwest { source, .. } => source.status(), + } + } + + /// Returns the error body if any + pub fn body(&self) -> Option<&str> { + match self { + Self::Client { body, .. } => body.as_deref(), + Self::BareRedirect => None, + Self::Reqwest { .. 
} => None, + } } pub fn error(self, store: &'static str, path: String) -> crate::Error { - match self.status { + match self.status() { Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { path, source: Box::new(self), @@ -86,16 +90,19 @@ impl Error { impl From for std::io::Error { fn from(err: Error) -> Self { use std::io::ErrorKind; - match (&err.source, err.status()) { - (Some(source), _) if source.is_builder() || source.is_request() => { - Self::new(ErrorKind::InvalidInput, err) - } - (_, Some(StatusCode::NOT_FOUND)) => Self::new(ErrorKind::NotFound, err), - (_, Some(StatusCode::BAD_REQUEST)) => Self::new(ErrorKind::InvalidInput, err), - (Some(source), None) if source.is_timeout() => { + match &err { + Error::Client { + status: StatusCode::NOT_FOUND, + .. + } => Self::new(ErrorKind::NotFound, err), + Error::Client { + status: StatusCode::BAD_REQUEST, + .. + } => Self::new(ErrorKind::InvalidInput, err), + Error::Reqwest { source, .. } if source.is_timeout() => { Self::new(ErrorKind::TimedOut, err) } - (Some(source), None) if source.is_connect() => { + Error::Reqwest { source, .. } if source.is_connect() => { Self::new(ErrorKind::NotConnected, err) } _ => Self::new(ErrorKind::Other, err), @@ -169,27 +176,21 @@ impl RetryExt for reqwest::RequestBuilder { Ok(r) => match r.error_for_status_ref() { Ok(_) if r.status().is_success() => return Ok(r), Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { - return Err(Error{ - message: "not modified".to_string(), - retries, - status: Some(r.status()), - source: None, + return Err(Error::Client { + body: None, + status: StatusCode::NOT_MODIFIED, }) } Ok(r) => { let is_bare_redirect = r.status().is_redirection() && !r.headers().contains_key(LOCATION); - let message = match is_bare_redirect { - true => "Received redirect without LOCATION, this normally indicates an incorrectly configured region".to_string(), + return match is_bare_redirect { + true => Err(Error::BareRedirect), // Not actually sure if this is reachable, but here for completeness - false => format!("request unsuccessful: {}", r.status()), - }; - - return Err(Error{ - message, - retries, - status: Some(r.status()), - source: None, - }) + false => Err(Error::Client { + body: None, + status: r.status(), + }) + } } Err(e) => { let status = r.status(); @@ -198,23 +199,26 @@ impl RetryExt for reqwest::RequestBuilder { || now.elapsed() > retry_timeout || !status.is_server_error() { - // Get the response message if returned a client error - let message = match status.is_client_error() { + return Err(match status.is_client_error() { true => match r.text().await { - Ok(message) if !message.is_empty() => message, - Ok(_) => "No Body".to_string(), - Err(e) => format!("error getting response body: {e}") + Ok(body) => { + Error::Client { + body: Some(body).filter(|b| !b.is_empty()), + status, + } + } + Err(e) => { + Error::Reqwest { + retries, + source: e, + } + } } - false => status.to_string(), - }; - - return Err(Error{ - message, - retries, - status: Some(status), - source: Some(e), - }) - + false => Error::Reqwest { + retries, + source: e, + } + }); } let sleep = backoff.next(); @@ -238,16 +242,14 @@ impl RetryExt for reqwest::RequestBuilder { || now.elapsed() > retry_timeout || !do_retry { - return Err(Error{ + return Err(Error::Reqwest { retries, - message: "request error".to_string(), - status: e.status(), - source: Some(e), + source: e, }) } let sleep = backoff.next(); retries += 1; - info!("Encountered request error ({}) backing off for {} seconds, retry {} of {}", e, 
sleep.as_secs_f32(), retries, max_retries); + info!("Encountered transport error ({}) backing off for {} seconds, retry {} of {}", e, sleep.as_secs_f32(), retries, max_retries); tokio::time::sleep(sleep).await; } } @@ -260,7 +262,7 @@ impl RetryExt for reqwest::RequestBuilder { #[cfg(test)] mod tests { use crate::client::mock_server::MockServer; - use crate::client::retry::RetryExt; + use crate::client::retry::{Error, RetryExt}; use crate::RetryConfig; use hyper::header::LOCATION; use hyper::{Body, Response}; @@ -294,8 +296,11 @@ mod tests { let e = do_request().await.unwrap_err(); assert_eq!(e.status().unwrap(), StatusCode::BAD_REQUEST); - assert_eq!(e.retries, 0); - assert_eq!(&e.message, "cupcakes"); + assert_eq!(e.body(), Some("cupcakes")); + assert_eq!( + e.to_string(), + "Client error with status 400 Bad Request: cupcakes" + ); // Handles client errors with no payload mock.push( @@ -307,8 +312,11 @@ mod tests { let e = do_request().await.unwrap_err(); assert_eq!(e.status().unwrap(), StatusCode::BAD_REQUEST); - assert_eq!(e.retries, 0); - assert_eq!(&e.message, "No Body"); + assert_eq!(e.body(), None); + assert_eq!( + e.to_string(), + "Client error with status 400 Bad Request: No Body" + ); // Should retry server error request mock.push( @@ -381,7 +389,8 @@ mod tests { ); let e = do_request().await.unwrap_err(); - assert_eq!(e.message, "Received redirect without LOCATION, this normally indicates an incorrectly configured region"); + assert!(matches!(e, Error::BareRedirect)); + assert_eq!(e.to_string(), "Received redirect without LOCATION, this normally indicates an incorrectly configured region"); // Gives up after the retrying the specified number of times for _ in 0..=retry.max_retries { @@ -393,22 +402,23 @@ mod tests { ); } - let e = do_request().await.unwrap_err(); - assert_eq!(e.retries, retry.max_retries); - assert_eq!(e.message, "502 Bad Gateway"); + let e = do_request().await.unwrap_err().to_string(); + assert!(e.starts_with("Error after 2 retries: HTTP status server error (502 Bad Gateway) for url"), "{e}"); // Panic results in an incomplete message error in the client mock.push_fn(|_| panic!()); let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::OK); - // Gives up after retrying mulitiple panics + // Gives up after retrying multiple panics for _ in 0..=retry.max_retries { mock.push_fn(|_| panic!()); } - let e = do_request().await.unwrap_err(); - assert_eq!(e.retries, retry.max_retries); - assert_eq!(e.message, "request error"); + let e = do_request().await.unwrap_err().to_string(); + assert!( + e.starts_with("Error after 2 retries: error sending request for url"), + "{e}" + ); // Shutdown mock.shutdown().await diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 3f5bf62..a0a60f2 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -1215,7 +1215,7 @@ mod test { .unwrap_err() .to_string(); assert!( - err.contains("HTTP status client error (404 Not Found)"), + err.contains("Client error with status 404 Not Found"), "{}", err ) From 2f3dde6799614f3c70fd40e5563612ae93289cde Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Thu, 12 Oct 2023 10:27:22 -0400 Subject: [PATCH 200/397] Add AWS presigned URL support (#4876) * refactor: Extract AWS algorithm string into a const * refactor: Extract a string_to_sign function and encapsulate non-reused values * refactor: Extract a scope function * refactor: Move hashing of canonical request into string_to_sign * refactor: Move canonical_request into 
string_to_sign * refactor: Move canonical URI construction into string_to_sign * refactor: Move canonical query construction into string_to_sign * feat: Implement sign method * feat: Publicly expose AWS S3 path_url for convenience constructing signed URLs * docs: Add an example of signing an upload URL * feat: Add a more convenient API on AmazonS3 for creating signed URLs * fix: Add credential token to the X-Amz-Security-Token query param if specified * fix: Change path_url to be pub crate instead of pub * feat: Define a public Signer trait for the signing interface * fix: Hide some doc test code Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fix: Use Method through reqwest which re-exports http anyway --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/aws/client.rs | 2 +- src/aws/credential.rs | 181 ++++++++++++++++++++++++++++++++++-------- src/aws/mod.rs | 64 ++++++++++++++- src/lib.rs | 2 + src/signer.rs | 40 ++++++++++ 5 files changed, 255 insertions(+), 34 deletions(-) create mode 100644 src/signer.rs diff --git a/src/aws/client.rs b/src/aws/client.rs index 1c35586..e3ac60e 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -212,7 +212,7 @@ pub struct S3Config { } impl S3Config { - fn path_url(&self, path: &Path) -> String { + pub(crate) fn path_url(&self, path: &Path) -> String { format!("{}/{}", self.bucket_endpoint, encode_path(path)) } } diff --git a/src/aws/credential.rs b/src/aws/credential.rs index be0ffa5..e27b71f 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -30,7 +30,7 @@ use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; use std::collections::BTreeMap; use std::sync::Arc; -use std::time::Instant; +use std::time::{Duration, Instant}; use tracing::warn; use url::Url; @@ -89,6 +89,7 @@ const DATE_HEADER: &str = "x-amz-date"; const HASH_HEADER: &str = "x-amz-content-sha256"; const TOKEN_HEADER: &str = "x-amz-security-token"; const AUTH_HEADER: &str = "authorization"; +const ALGORITHM: &str = "AWS4-HMAC-SHA256"; impl<'a> AwsAuthorizer<'a> { /// Create a new [`AwsAuthorizer`] @@ -154,21 +155,110 @@ impl<'a> AwsAuthorizer<'a> { let header_digest = HeaderValue::from_str(&digest).unwrap(); request.headers_mut().insert(HASH_HEADER, header_digest); - // Each path segment must be URI-encoded twice (except for Amazon S3 which only gets URI-encoded once). 
+ let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); + + let scope = self.scope(date); + + let string_to_sign = self.string_to_sign( + date, + &scope, + request.method(), + request.url(), + &canonical_headers, + &signed_headers, + &digest, + ); + + // sign the string + let signature = + self.credential + .sign(&string_to_sign, date, self.region, self.service); + + // build the actual auth header + let authorisation = format!( + "{} Credential={}/{}, SignedHeaders={}, Signature={}", + ALGORITHM, self.credential.key_id, scope, signed_headers, signature + ); + + let authorization_val = HeaderValue::from_str(&authorisation).unwrap(); + request.headers_mut().insert(AUTH_HEADER, authorization_val); + } + + pub(crate) fn sign(&self, method: Method, url: &mut Url, expires_in: Duration) { + let date = self.date.unwrap_or_else(Utc::now); + let scope = self.scope(date); + + // https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + url.query_pairs_mut() + .append_pair("X-Amz-Algorithm", ALGORITHM) + .append_pair( + "X-Amz-Credential", + &format!("{}/{}", self.credential.key_id, scope), + ) + .append_pair("X-Amz-Date", &date.format("%Y%m%dT%H%M%SZ").to_string()) + .append_pair("X-Amz-Expires", &expires_in.as_secs().to_string()) + .append_pair("X-Amz-SignedHeaders", "host"); + + // For S3, you must include the X-Amz-Security-Token query parameter in the URL if + // using credentials sourced from the STS service. + if let Some(ref token) = self.credential.token { + url.query_pairs_mut() + .append_pair("X-Amz-Security-Token", token); + } + + // We don't have a payload; the user is going to send the payload directly themselves. + let digest = UNSIGNED_PAYLOAD; + + let host = &url[url::Position::BeforeHost..url::Position::AfterPort].to_string(); + let mut headers = HeaderMap::new(); + let host_val = HeaderValue::from_str(host).unwrap(); + headers.insert("host", host_val); + + let (signed_headers, canonical_headers) = canonicalize_headers(&headers); + + let string_to_sign = self.string_to_sign( + date, + &scope, + &method, + url, + &canonical_headers, + &signed_headers, + digest, + ); + + let signature = + self.credential + .sign(&string_to_sign, date, self.region, self.service); + + url.query_pairs_mut() + .append_pair("X-Amz-Signature", &signature); + } + + #[allow(clippy::too_many_arguments)] + fn string_to_sign( + &self, + date: DateTime, + scope: &str, + request_method: &Method, + url: &Url, + canonical_headers: &str, + signed_headers: &str, + digest: &str, + ) -> String { + // Each path segment must be URI-encoded twice (except for Amazon S3 which only gets + // URI-encoded once). 
// see https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html let canonical_uri = match self.service { - "s3" => request.url().path().to_string(), - _ => utf8_percent_encode(request.url().path(), &STRICT_PATH_ENCODE_SET) - .to_string(), + "s3" => url.path().to_string(), + _ => utf8_percent_encode(url.path(), &STRICT_PATH_ENCODE_SET).to_string(), }; - let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); - let canonical_query = canonicalize_query(request.url()); + let canonical_query = canonicalize_query(url); // https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html let canonical_request = format!( "{}\n{}\n{}\n{}\n{}\n{}", - request.method().as_str(), + request_method.as_str(), canonical_uri, canonical_query, canonical_headers, @@ -177,33 +267,23 @@ impl<'a> AwsAuthorizer<'a> { ); let hashed_canonical_request = hex_digest(canonical_request.as_bytes()); - let scope = format!( - "{}/{}/{}/aws4_request", - date.format("%Y%m%d"), - self.region, - self.service - ); - let string_to_sign = format!( - "AWS4-HMAC-SHA256\n{}\n{}\n{}", + format!( + "{}\n{}\n{}\n{}", + ALGORITHM, date.format("%Y%m%dT%H%M%SZ"), scope, hashed_canonical_request - ); - - // sign the string - let signature = - self.credential - .sign(&string_to_sign, date, self.region, self.service); - - // build the actual auth header - let authorisation = format!( - "AWS4-HMAC-SHA256 Credential={}/{}, SignedHeaders={}, Signature={}", - self.credential.key_id, scope, signed_headers, signature - ); + ) + } - let authorization_val = HeaderValue::from_str(&authorisation).unwrap(); - request.headers_mut().insert(AUTH_HEADER, authorization_val); + fn scope(&self, date: DateTime) -> String { + format!( + "{}/{}/{}/aws4_request", + date.format("%Y%m%d"), + self.region, + self.service + ) } } @@ -667,7 +747,46 @@ mod tests { }; authorizer.authorize(&mut request, None); - assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699") + assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699"); + } + + #[test] + fn signed_get_url() { + // Values from https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + let date = DateTime::parse_from_rfc3339("2013-05-24T00:00:00Z") + .unwrap() + .with_timezone(&Utc); + + let authorizer = AwsAuthorizer { + date: Some(date), + credential: &credential, + service: "s3", + region: "us-east-1", + sign_payload: false, + }; + + let mut url = + Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); + authorizer.sign(Method::GET, &mut url, Duration::from_secs(86400)); + + assert_eq!( + url, + Url::parse( + "https://examplebucket.s3.amazonaws.com/test.txt?\ + X-Amz-Algorithm=AWS4-HMAC-SHA256&\ + X-Amz-Credential=AKIAIOSFODNN7EXAMPLE%2F20130524%2Fus-east-1%2Fs3%2Faws4_request&\ + X-Amz-Date=20130524T000000Z&\ + X-Amz-Expires=86400&\ + X-Amz-SignedHeaders=host&\ + 
X-Amz-Signature=aeeed9bbccd4d02ee5c0109b86d86835f995330da4c265957d157751f604d404" + ).unwrap() + ); } #[test] diff --git a/src/aws/mod.rs b/src/aws/mod.rs index db3e1b9..0028be9 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -36,10 +36,10 @@ use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; +use reqwest::Method; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::str::FromStr; -use std::sync::Arc; +use std::{str::FromStr, sync::Arc, time::Duration}; use tokio::io::AsyncWrite; use tracing::info; use url::Url; @@ -56,6 +56,7 @@ use crate::client::{ }; use crate::config::ConfigValue; use crate::multipart::{PartId, PutPart, WriteMultiPart}; +use crate::signer::Signer; use crate::{ ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, RetryConfig, @@ -209,6 +210,65 @@ impl AmazonS3 { pub fn credentials(&self) -> &AwsCredentialProvider { &self.client.config().credentials } + + /// Create a full URL to the resource specified by `path` with this instance's configuration. + fn path_url(&self, path: &Path) -> String { + self.client.config().path_url(path) + } +} + +#[async_trait] +impl Signer for AmazonS3 { + /// Create a URL containing the relevant [AWS SigV4] query parameters that authorize a request + /// via `method` to the resource at `path` valid for the duration specified in `expires_in`. + /// + /// [AWS SigV4]: https://docs.aws.amazon.com/IAM/latest/UserGuide/create-signed-request.html + /// + /// # Example + /// + /// This example returns a URL that will enable a user to upload a file to + /// "some-folder/some-file.txt" in the next hour. + /// + /// ``` + /// # async fn example() -> Result<(), Box> { + /// # use object_store::{aws::AmazonS3Builder, path::Path, signer::Signer}; + /// # use reqwest::Method; + /// # use std::time::Duration; + /// # + /// let region = "us-east-1"; + /// let s3 = AmazonS3Builder::new() + /// .with_region(region) + /// .with_bucket_name("my-bucket") + /// .with_access_key_id("my-access-key-id") + /// .with_secret_access_key("my-secret-access-key") + /// .build()?; + /// + /// let url = s3.signed_url( + /// Method::PUT, + /// &Path::from("some-folder/some-file.txt"), + /// Duration::from_secs(60 * 60) + /// ).await?; + /// # Ok(()) + /// # } + /// ``` + async fn signed_url( + &self, + method: Method, + path: &Path, + expires_in: Duration, + ) -> Result { + let credential = self.credentials().get_credential().await?; + let authorizer = + AwsAuthorizer::new(&credential, "s3", &self.client.config().region); + + let path_url = self.path_url(path); + let mut url = + Url::parse(&path_url).context(UnableToParseUrlSnafu { url: path_url })?; + + authorizer.sign(method, &mut url, expires_in); + + Ok(url) + } } #[async_trait] diff --git a/src/lib.rs b/src/lib.rs index 3fd363f..68e785b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -267,6 +267,8 @@ pub mod local; pub mod memory; pub mod path; pub mod prefix; +#[cfg(feature = "cloud")] +pub mod signer; pub mod throttle; #[cfg(feature = "cloud")] diff --git a/src/signer.rs b/src/signer.rs new file mode 100644 index 0000000..f1f35de --- /dev/null +++ b/src/signer.rs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Abstraction of signed URL generation for those object store implementations that support it + +use crate::{path::Path, Result}; +use async_trait::async_trait; +use reqwest::Method; +use std::{fmt, time::Duration}; +use url::Url; + +/// Universal API to presigned URLs generated from multiple object store services. Not supported by +/// all object store services. +#[async_trait] +pub trait Signer: Send + Sync + fmt::Debug + 'static { + /// Given the intended [`Method`] and [`Path`] to use and the desired length of time for which + /// the URL should be valid, return a signed [`Url`] created with the object store + /// implementation's credentials such that the URL can be handed to something that doesn't have + /// access to the object store's credentials, to allow limited access to the object store. + async fn signed_url( + &self, + method: Method, + path: &Path, + expires_in: Duration, + ) -> Result; +} From ac9bb172dd9b0b07b9d0436c2ac10d6e8478ba56 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 15 Oct 2023 11:04:14 +0100 Subject: [PATCH 201/397] Add GetOptions::head (#4931) --- src/aws/client.rs | 9 ++------- src/aws/mod.rs | 4 ---- src/azure/client.rs | 9 ++------- src/azure/mod.rs | 4 ---- src/client/get.rs | 24 +++--------------------- src/gcp/mod.rs | 13 ++----------- src/http/client.rs | 15 +++++---------- src/http/mod.rs | 4 ---- src/lib.rs | 12 +++++++++++- src/local.rs | 37 ++++--------------------------------- 10 files changed, 29 insertions(+), 102 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index e3ac60e..ac07f9a 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -554,15 +554,10 @@ impl GetClient for S3Client { const STORE: &'static str = STORE; /// Make an S3 GET request - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); - let method = match head { + let method = match options.head { true => Method::HEAD, false => Method::GET, }; diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 0028be9..285ee2f 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -307,10 +307,6 @@ impl ObjectStore for AmazonS3 { self.client.get_opts(location, options).await } - async fn head(&self, location: &Path) -> Result { - self.client.head(location).await - } - async fn delete(&self, location: &Path) -> Result<()> { self.client.delete_request(location, &()).await } diff --git a/src/azure/client.rs b/src/azure/client.rs index cd1a3a1..f65388b 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -264,15 +264,10 @@ impl GetClient for AzureClient { /// Make an Azure GET request /// /// - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { + async fn get_request(&self, path: 
&Path, options: GetOptions) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); - let method = match head { + let method = match options.head { true => Method::HEAD, false => Method::GET, }; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index b210d48..9017634 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -202,10 +202,6 @@ impl ObjectStore for MicrosoftAzure { self.client.get_opts(location, options).await } - async fn head(&self, location: &Path) -> Result { - self.client.head(location).await - } - async fn delete(&self, location: &Path) -> Result<()> { self.client.delete_request(location, &()).await } diff --git a/src/client/get.rs b/src/client/get.rs index 333f6fe..7f68b6d 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -17,7 +17,7 @@ use crate::client::header::{header_meta, HeaderConfig}; use crate::path::Path; -use crate::{Error, GetOptions, GetResult, ObjectMeta}; +use crate::{Error, GetOptions, GetResult}; use crate::{GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; @@ -34,27 +34,20 @@ pub trait GetClient: Send + Sync + 'static { last_modified_required: true, }; - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result; + async fn get_request(&self, path: &Path, options: GetOptions) -> Result; } /// Extension trait for [`GetClient`] that adds common retrieval functionality #[async_trait] pub trait GetClientExt { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; - - async fn head(&self, location: &Path) -> Result; } #[async_trait] impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); - let response = self.get_request(location, options, false).await?; + let response = self.get_request(location, options).await?; let meta = header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { Error::Generic { @@ -77,15 +70,4 @@ impl GetClientExt for T { meta, }) } - - async fn head(&self, location: &Path) -> Result { - let options = GetOptions::default(); - let response = self.get_request(location, options, true).await?; - header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { - Error::Generic { - store: T::STORE, - source: Box::new(e), - } - }) - } } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index a0a60f2..f80704b 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -389,16 +389,11 @@ impl GetClient for GoogleCloudStorageClient { const STORE: &'static str = STORE; /// Perform a get request - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let credential = self.get_credential().await?; let url = self.object_url(path); - let method = match head { + let method = match options.head { true => Method::HEAD, false => Method::GET, }; @@ -604,10 +599,6 @@ impl ObjectStore for GoogleCloudStorage { self.client.get_opts(location, options).await } - async fn head(&self, location: &Path) -> Result { - self.client.head(location).await - } - async fn delete(&self, location: &Path) -> Result<()> { self.client.delete_request(location).await } diff --git a/src/http/client.rs b/src/http/client.rs index 0bd2e56..b2a6ac0 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -288,14 +288,9 @@ impl GetClient for Client { last_modified_required: false, }; - async fn get_request( - &self, - location: 
&Path, - options: GetOptions, - head: bool, - ) -> Result { - let url = self.path_url(location); - let method = match head { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { + let url = self.path_url(path); + let method = match options.head { true => Method::HEAD, false => Method::GET, }; @@ -311,7 +306,7 @@ impl GetClient for Client { Some(StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED) => { crate::Error::NotFound { source: Box::new(source), - path: location.to_string(), + path: path.to_string(), } } _ => Error::Request { source }.into(), @@ -322,7 +317,7 @@ impl GetClient for Client { if has_range && res.status() != StatusCode::PARTIAL_CONTENT { return Err(crate::Error::NotSupported { source: Box::new(Error::RangeNotSupported { - href: location.to_string(), + href: path.to_string(), }), }); } diff --git a/src/http/mod.rs b/src/http/mod.rs index e9ed590..6ffb623 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -118,10 +118,6 @@ impl ObjectStore for HttpStore { self.client.get_opts(location, options).await } - async fn head(&self, location: &Path) -> Result { - self.client.head(location).await - } - async fn delete(&self, location: &Path) -> Result<()> { self.client.delete(location).await } diff --git a/src/lib.rs b/src/lib.rs index 68e785b..ff0a465 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -410,7 +410,13 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } /// Return the metadata for the specified location - async fn head(&self, location: &Path) -> Result; + async fn head(&self, location: &Path) -> Result { + let options = GetOptions { + head: true, + ..Default::default() + }; + Ok(self.get_opts(location, options).await?.meta) + } /// Delete the object at the specified location. async fn delete(&self, location: &Path) -> Result<()>; @@ -716,6 +722,10 @@ pub struct GetOptions { /// /// pub range: Option>, + /// Request transfer of no content + /// + /// + pub head: bool, } impl GetOptions { diff --git a/src/local.rs b/src/local.rs index 69da170..3ed63a4 100644 --- a/src/local.rs +++ b/src/local.rs @@ -419,35 +419,6 @@ impl ObjectStore for LocalFileSystem { .await } - async fn head(&self, location: &Path) -> Result { - let path = self.config.path_to_filesystem(location)?; - let location = location.clone(); - - maybe_spawn_blocking(move || { - let metadata = match metadata(&path) { - Err(e) => Err(match e.kind() { - ErrorKind::NotFound => Error::NotFound { - path: path.clone(), - source: e, - }, - _ => Error::Metadata { - source: e.into(), - path: location.to_string(), - }, - }), - Ok(m) => match !m.is_dir() { - true => Ok(m), - false => Err(Error::NotFound { - path, - source: io::Error::new(ErrorKind::NotFound, "is directory"), - }), - }, - }?; - convert_metadata(metadata, location) - }) - .await - } - async fn delete(&self, location: &Path) -> Result<()> { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || match std::fs::remove_file(&path) { @@ -1604,15 +1575,15 @@ mod unix_test { let path = root.path().join(filename); unistd::mkfifo(&path, stat::Mode::S_IRWXU).unwrap(); - let location = Path::from(filename); - integration.head(&location).await.unwrap(); - // Need to open read and write side in parallel let spawned = tokio::task::spawn_blocking(|| { - OpenOptions::new().write(true).open(path).unwrap(); + OpenOptions::new().write(true).open(path).unwrap() }); + let location = Path::from(filename); + integration.head(&location).await.unwrap(); integration.get(&location).await.unwrap(); + 
spawned.await.unwrap(); } } From 054faa6cabf541e7ad4ab0f9ee2b304c185fa164 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 16 Oct 2023 10:56:10 +0100 Subject: [PATCH 202/397] Allow opting out of request signing (#4927) (#4929) --- src/aws/client.rs | 24 +++++++++++++---------- src/aws/credential.rs | 21 +++++++++++++-------- src/aws/mod.rs | 44 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 18 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index ac07f9a..8a45a9f 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -207,6 +207,7 @@ pub struct S3Config { pub retry_config: RetryConfig, pub client_options: ClientOptions, pub sign_payload: bool, + pub skip_signature: bool, pub checksum: Option, pub copy_if_not_exists: Option, } @@ -234,8 +235,11 @@ impl S3Client { &self.config } - async fn get_credential(&self) -> Result> { - self.config.credentials.get_credential().await + async fn get_credential(&self) -> Result>> { + Ok(match self.config.skip_signature { + false => Some(self.config.credentials.get_credential().await?), + true => None, + }) } /// Make an S3 PUT request @@ -271,7 +275,7 @@ impl S3Client { let response = builder .query(query) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -299,7 +303,7 @@ impl S3Client { .request(Method::DELETE, url) .query(query) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -390,7 +394,7 @@ impl S3Client { .header(CONTENT_TYPE, "application/xml") .body(body) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -459,7 +463,7 @@ impl S3Client { builder .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -490,7 +494,7 @@ impl S3Client { .client .request(Method::POST, url) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -535,7 +539,7 @@ impl S3Client { .query(&[("uploadId", upload_id)]) .body(body) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -567,7 +571,7 @@ impl GetClient for S3Client { let response = builder .with_get_options(options) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -621,7 +625,7 @@ impl ListClient for S3Client { .request(Method::GET, &url) .query(&query) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, diff --git a/src/aws/credential.rs b/src/aws/credential.rs index e27b71f..e0c5de5 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -291,7 +291,7 @@ pub trait CredentialExt { /// Sign a request fn with_aws_sigv4( self, - credential: &AwsCredential, + credential: Option<&AwsCredential>, region: &str, service: &str, sign_payload: bool, @@ -302,20 +302,25 @@ pub trait CredentialExt { impl CredentialExt for RequestBuilder { fn with_aws_sigv4( self, - credential: &AwsCredential, + credential: Option<&AwsCredential>, region: &str, service: &str, sign_payload: bool, payload_sha256: Option<&[u8]>, ) -> Self { - let (client, request) = self.build_split(); - let mut request = request.expect("request valid"); + match credential { + Some(credential) => { + let 
(client, request) = self.build_split(); + let mut request = request.expect("request valid"); - AwsAuthorizer::new(credential, service, region) - .with_sign_payload(sign_payload) - .authorize(&mut request, payload_sha256); + AwsAuthorizer::new(credential, service, region) + .with_sign_payload(sign_payload) + .authorize(&mut request, payload_sha256); - Self::from_parts(client, request) + Self::from_parts(client, request) + } + None => self, + } } } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 285ee2f..70170a3 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -448,6 +448,8 @@ pub struct AmazonS3Builder { client_options: ClientOptions, /// Credentials credentials: Option, + /// Skip signing requests + skip_signature: ConfigValue, /// Copy if not exists copy_if_not_exists: Option>, } @@ -586,6 +588,9 @@ pub enum AmazonS3ConfigKey { /// See [`S3CopyIfNotExists`] CopyIfNotExists, + /// Skip signing request + SkipSignature, + /// Client options Client(ClientConfigKey), } @@ -608,6 +613,7 @@ impl AsRef for AmazonS3ConfigKey { Self::ContainerCredentialsRelativeUri => { "aws_container_credentials_relative_uri" } + Self::SkipSignature => "aws_skip_signature", Self::CopyIfNotExists => "copy_if_not_exists", Self::Client(opt) => opt.as_ref(), } @@ -642,6 +648,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_container_credentials_relative_uri" => { Ok(Self::ContainerCredentialsRelativeUri) } + "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), "copy_if_not_exists" => Ok(Self::CopyIfNotExists), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), @@ -753,6 +760,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } + AmazonS3ConfigKey::SkipSignature => self.skip_signature.parse(value), AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) } @@ -823,6 +831,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { self.container_credentials_relative_uri.clone() } + AmazonS3ConfigKey::SkipSignature => Some(self.skip_signature.to_string()), AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists.as_ref().map(ToString::to_string) } @@ -977,6 +986,14 @@ impl AmazonS3Builder { self } + /// If enabled, [`AmazonS3`] will not fetch credentials and will not sign requests + /// + /// This can be useful when interacting with public S3 buckets that deny authorized requests + pub fn with_skip_signature(mut self, skip_signature: bool) -> Self { + self.skip_signature = skip_signature.into(); + self + } + /// Sets the [checksum algorithm] which has to be used for object integrity check during upload. 
/// /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html @@ -1146,6 +1163,7 @@ impl AmazonS3Builder { retry_config: self.retry_config, client_options: self.client_options, sign_payload: !self.unsigned_payload.get()?, + skip_signature: self.skip_signature.get()?, checksum, copy_if_not_exists, }; @@ -1505,4 +1523,30 @@ mod s3_resolve_bucket_region_tests { assert!(result.is_err()); } + + #[tokio::test] + #[ignore = "Tests shouldn't call use remote services by default"] + async fn test_disable_creds() { + // https://registry.opendata.aws/daylight-osm/ + let v1 = AmazonS3Builder::new() + .with_bucket_name("daylight-map-distribution") + .with_region("us-west-1") + .with_access_key_id("local") + .with_secret_access_key("development") + .build() + .unwrap(); + + let prefix = Path::from("release"); + + v1.list_with_delimiter(Some(&prefix)).await.unwrap_err(); + + let v2 = AmazonS3Builder::new() + .with_bucket_name("daylight-map-distribution") + .with_region("us-west-1") + .with_skip_signature(true) + .build() + .unwrap(); + + v2.list_with_delimiter(Some(&prefix)).await.unwrap(); + } } From 0272d0624c94cad626b2c213669b4266a5caf778 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 16 Oct 2023 10:56:25 +0100 Subject: [PATCH 203/397] Default connection and request timeouts of 5 seconds (#4928) * Default connection and request timeouts of 5 seconds * Clippy * Allow disabling timeouts --- src/aws/mod.rs | 3 +-- src/azure/mod.rs | 2 +- src/client/mod.rs | 66 +++++++++++++++++++++++++++++++++++++++++++++-- src/gcp/mod.rs | 2 +- 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 70170a3..3ddce08 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -1130,8 +1130,7 @@ impl AmazonS3Builder { Arc::new(TokenCredentialProvider::new( token, - // The instance metadata endpoint is access over HTTP - self.client_options.clone().with_allow_http(true).client()?, + self.client_options.metadata_client()?, self.retry_config.clone(), )) as _ }; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 9017634..190b73b 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -1070,7 +1070,7 @@ impl MicrosoftAzureBuilder { ); Arc::new(TokenCredentialProvider::new( msi_credential, - self.client_options.clone().with_allow_http(true).client()?, + self.client_options.metadata_client()?, self.retry_config.clone(), )) as _ }; diff --git a/src/client/mod.rs b/src/client/mod.rs index ee9d62a..137da2b 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -166,7 +166,7 @@ impl FromStr for ClientConfigKey { } /// HTTP client configuration for remote object stores -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub struct ClientOptions { user_agent: Option>, content_type_map: HashMap, @@ -188,6 +188,35 @@ pub struct ClientOptions { http2_only: ConfigValue, } +impl Default for ClientOptions { + fn default() -> Self { + // Defaults based on + // + // + // Which recommend a connection timeout of 3.1s and a request timeout of 2s + Self { + user_agent: None, + content_type_map: Default::default(), + default_content_type: None, + default_headers: None, + proxy_url: None, + proxy_ca_certificate: None, + proxy_excludes: None, + allow_http: Default::default(), + allow_insecure: Default::default(), + timeout: Some(Duration::from_secs(5).into()), + connect_timeout: Some(Duration::from_secs(5).into()), + pool_idle_timeout: None, + pool_max_idle_per_host: None, + 
http2_keep_alive_interval: None, + http2_keep_alive_timeout: None, + http2_keep_alive_while_idle: Default::default(), + http1_only: Default::default(), + http2_only: Default::default(), + } + } +} + impl ClientOptions { /// Create a new [`ClientOptions`] with default values pub fn new() -> Self { @@ -367,17 +396,37 @@ impl ClientOptions { /// /// The timeout is applied from when the request starts connecting until the /// response body has finished + /// + /// Default is 5 seconds pub fn with_timeout(mut self, timeout: Duration) -> Self { self.timeout = Some(ConfigValue::Parsed(timeout)); self } + /// Disables the request timeout + /// + /// See [`Self::with_timeout`] + pub fn with_timeout_disabled(mut self) -> Self { + self.timeout = None; + self + } + /// Set a timeout for only the connect phase of a Client + /// + /// Default is 5 seconds pub fn with_connect_timeout(mut self, timeout: Duration) -> Self { self.connect_timeout = Some(ConfigValue::Parsed(timeout)); self } + /// Disables the connection timeout + /// + /// See [`Self::with_connect_timeout`] + pub fn with_connect_timeout_disabled(mut self) -> Self { + self.timeout = None; + self + } + /// Set the pool max idle timeout /// /// This is the length of time an idle connection will be kept alive @@ -444,7 +493,20 @@ impl ClientOptions { } } - pub(crate) fn client(&self) -> super::Result { + /// Create a [`Client`] with overrides optimised for metadata endpoint access + /// + /// In particular: + /// * Allows HTTP as metadata endpoints do not use TLS + /// * Configures a low connection timeout to provide quick feedback if not present + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] + pub(crate) fn metadata_client(&self) -> Result { + self.clone() + .with_allow_http(true) + .with_connect_timeout(Duration::from_secs(1)) + .client() + } + + pub(crate) fn client(&self) -> Result { let mut builder = ClientBuilder::new(); match &self.user_agent { diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index f80704b..f8a1631 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -1071,7 +1071,7 @@ impl GoogleCloudStorageBuilder { } else { Arc::new(TokenCredentialProvider::new( InstanceCredentialProvider::new(audience), - self.client_options.clone().with_allow_http(true).client()?, + self.client_options.metadata_client()?, self.retry_config.clone(), )) as _ }; From ceb1261ad35b9da503e63bbda90dd961959298da Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 16 Oct 2023 14:18:53 +0100 Subject: [PATCH 204/397] Support service_account in ApplicationDefaultCredentials and Use SelfSignedJwt (#4926) * Support service_account in ApplicationDefaultCredentials * Use SelfSignedJwt for Service Accounts * Update CI * Apply suggestions from code review Co-authored-by: Marco Neumann --------- Co-authored-by: Marco Neumann --- src/gcp/credential.rs | 219 ++++++++++++++++-------------------------- src/gcp/mod.rs | 45 +++++---- 2 files changed, 107 insertions(+), 157 deletions(-) diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index ad21c33..87f8e24 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -17,10 +17,8 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; -use crate::client::{TokenCredentialProvider, TokenProvider}; -use crate::gcp::credential::Error::UnsupportedCredentialsType; -use crate::gcp::{GcpCredentialProvider, STORE}; -use crate::ClientOptions; +use crate::client::TokenProvider; +use crate::gcp::STORE; use crate::RetryConfig; 
use async_trait::async_trait; use base64::prelude::BASE64_URL_SAFE_NO_PAD; @@ -28,6 +26,7 @@ use base64::Engine; use futures::TryFutureExt; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; +use serde::Deserialize; use snafu::{ResultExt, Snafu}; use std::env; use std::fs::File; @@ -37,6 +36,10 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::info; +pub const DEFAULT_SCOPE: &str = "https://www.googleapis.com/auth/devstorage.full_control"; + +pub const DEFAULT_GCS_BASE_URL: &str = "https://storage.googleapis.com"; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] @@ -68,9 +71,6 @@ pub enum Error { #[snafu(display("Error getting token response body: {}", source))] TokenResponseBody { source: reqwest::Error }, - - #[snafu(display("Unsupported ApplicationCredentials type: {}", type_))] - UnsupportedCredentialsType { type_: String }, } impl From for crate::Error { @@ -92,48 +92,48 @@ pub struct GcpCredential { pub type Result = std::result::Result; #[derive(Debug, Default, serde::Serialize)] -pub struct JwtHeader { +pub struct JwtHeader<'a> { /// The type of JWS: it can only be "JWT" here /// /// Defined in [RFC7515#4.1.9](https://tools.ietf.org/html/rfc7515#section-4.1.9). #[serde(skip_serializing_if = "Option::is_none")] - pub typ: Option, + pub typ: Option<&'a str>, /// The algorithm used /// /// Defined in [RFC7515#4.1.1](https://tools.ietf.org/html/rfc7515#section-4.1.1). - pub alg: String, + pub alg: &'a str, /// Content type /// /// Defined in [RFC7519#5.2](https://tools.ietf.org/html/rfc7519#section-5.2). #[serde(skip_serializing_if = "Option::is_none")] - pub cty: Option, + pub cty: Option<&'a str>, /// JSON Key URL /// /// Defined in [RFC7515#4.1.2](https://tools.ietf.org/html/rfc7515#section-4.1.2). #[serde(skip_serializing_if = "Option::is_none")] - pub jku: Option, + pub jku: Option<&'a str>, /// Key ID /// /// Defined in [RFC7515#4.1.4](https://tools.ietf.org/html/rfc7515#section-4.1.4). #[serde(skip_serializing_if = "Option::is_none")] - pub kid: Option, + pub kid: Option<&'a str>, /// X.509 URL /// /// Defined in [RFC7515#4.1.5](https://tools.ietf.org/html/rfc7515#section-4.1.5). #[serde(skip_serializing_if = "Option::is_none")] - pub x5u: Option, + pub x5u: Option<&'a str>, /// X.509 certificate thumbprint /// /// Defined in [RFC7515#4.1.7](https://tools.ietf.org/html/rfc7515#section-4.1.7). #[serde(skip_serializing_if = "Option::is_none")] - pub x5t: Option, + pub x5t: Option<&'a str>, } #[derive(serde::Serialize)] struct TokenClaims<'a> { iss: &'a str, + sub: &'a str, scope: &'a str, - aud: &'a str, exp: u64, iat: u64, } @@ -144,28 +144,32 @@ struct TokenResponse { expires_in: u64, } -/// Encapsulates the logic to perform an OAuth token challenge +/// Self-signed JWT (JSON Web Token). 
+/// +/// # References +/// - #[derive(Debug)] -pub struct OAuthProvider { +pub struct SelfSignedJwt { issuer: String, scope: String, - audience: String, key_pair: RsaKeyPair, jwt_header: String, random: ring::rand::SystemRandom, } -impl OAuthProvider { - /// Create a new [`OAuthProvider`] +impl SelfSignedJwt { + /// Create a new [`SelfSignedJwt`] pub fn new( + key_id: String, issuer: String, private_key_pem: String, scope: String, - audience: String, ) -> Result { let key_pair = decode_first_rsa_key(private_key_pem)?; let jwt_header = b64_encode_obj(&JwtHeader { - alg: "RS256".to_string(), + alg: "RS256", + typ: Some("JWT"), + kid: Some(&key_id), ..Default::default() })?; @@ -173,7 +177,6 @@ impl OAuthProvider { issuer, key_pair, scope, - audience, jwt_header, random: ring::rand::SystemRandom::new(), }) @@ -181,24 +184,24 @@ impl OAuthProvider { } #[async_trait] -impl TokenProvider for OAuthProvider { +impl TokenProvider for SelfSignedJwt { type Credential = GcpCredential; /// Fetch a fresh token async fn fetch_token( &self, - client: &Client, - retry: &RetryConfig, + _client: &Client, + _retry: &RetryConfig, ) -> crate::Result>> { let now = seconds_since_epoch(); let exp = now + 3600; let claims = TokenClaims { iss: &self.issuer, + sub: &self.issuer, scope: &self.scope, - aud: &self.audience, - exp, iat: now, + exp, }; let claim_str = b64_encode_obj(&claims)?; @@ -214,28 +217,11 @@ impl TokenProvider for OAuthProvider { .context(SignSnafu)?; let signature = BASE64_URL_SAFE_NO_PAD.encode(sig_bytes); - let jwt = [message, signature].join("."); - - let body = [ - ("grant_type", "urn:ietf:params:oauth:grant-type:jwt-bearer"), - ("assertion", &jwt), - ]; - - let response: TokenResponse = client - .request(Method::POST, &self.audience) - .form(&body) - .send_retry(retry) - .await - .context(TokenRequestSnafu)? - .json() - .await - .context(TokenResponseBodySnafu)?; + let bearer = [message, signature].join("."); Ok(TemporaryToken { - token: Arc::new(GcpCredential { - bearer: response.access_token, - }), - expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), + token: Arc::new(GcpCredential { bearer }), + expiry: Some(Instant::now() + Duration::from_secs(3600)), }) } } @@ -259,29 +245,24 @@ pub struct ServiceAccountCredentials { /// The private key in RSA format. pub private_key: String, + /// The private key ID + pub private_key_id: String, + /// The email address associated with the service account. pub client_email: String, /// Base URL for GCS - #[serde(default = "default_gcs_base_url")] - pub gcs_base_url: String, + #[serde(default)] + pub gcs_base_url: Option, /// Disable oauth and use empty tokens. - #[serde(default = "default_disable_oauth")] + #[serde(default)] pub disable_oauth: bool, } -pub fn default_gcs_base_url() -> String { - "https://storage.googleapis.com".to_owned() -} - -pub fn default_disable_oauth() -> bool { - false -} - impl ServiceAccountCredentials { /// Create a new [`ServiceAccountCredentials`] from a file. - pub fn from_file>(path: P) -> Result { + pub fn from_file>(path: P) -> Result { read_credentials_file(path) } @@ -290,17 +271,20 @@ impl ServiceAccountCredentials { serde_json::from_str(key).context(DecodeCredentialsSnafu) } - /// Create an [`OAuthProvider`] from this credentials struct. - pub fn oauth_provider( - self, - scope: &str, - audience: &str, - ) -> crate::Result { - Ok(OAuthProvider::new( + /// Create a [`SelfSignedJwt`] from this credentials struct. 
+ /// + /// We use a scope of [`DEFAULT_SCOPE`] as opposed to an audience + /// as GCS appears to not support audience + /// + /// # References + /// - + /// - + pub fn token_provider(self) -> crate::Result { + Ok(SelfSignedJwt::new( + self.private_key_id, self.client_email, self.private_key, - scope.to_string(), - audience.to_string(), + DEFAULT_SCOPE.to_string(), )?) } } @@ -337,25 +321,13 @@ fn b64_encode_obj(obj: &T) -> Result { /// /// #[derive(Debug, Default)] -pub struct InstanceCredentialProvider { - audience: String, -} - -impl InstanceCredentialProvider { - /// Create a new [`InstanceCredentialProvider`], we need to control the client in order to enable http access so save the options. - pub fn new>(audience: T) -> Self { - Self { - audience: audience.into(), - } - } -} +pub struct InstanceCredentialProvider {} /// Make a request to the metadata server to fetch a token, using a a given hostname. async fn make_metadata_request( client: &Client, hostname: &str, retry: &RetryConfig, - audience: &str, ) -> crate::Result { let url = format!( "http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token" @@ -363,7 +335,7 @@ async fn make_metadata_request( let response: TokenResponse = client .request(Method::GET, url) .header("Metadata-Flavor", "Google") - .query(&[("audience", audience)]) + .query(&[("audience", "https://www.googleapis.com/oauth2/v4/token")]) .send_retry(retry) .await .context(TokenRequestSnafu)? @@ -388,12 +360,9 @@ impl TokenProvider for InstanceCredentialProvider { const METADATA_HOST: &str = "metadata"; info!("fetching token from metadata server"); - let response = - make_metadata_request(client, METADATA_HOST, retry, &self.audience) - .or_else(|_| { - make_metadata_request(client, METADATA_IP, retry, &self.audience) - }) - .await?; + let response = make_metadata_request(client, METADATA_HOST, retry) + .or_else(|_| make_metadata_request(client, METADATA_IP, retry)) + .await?; let token = TemporaryToken { token: Arc::new(GcpCredential { bearer: response.access_token, @@ -404,62 +373,36 @@ impl TokenProvider for InstanceCredentialProvider { } } -/// ApplicationDefaultCredentials -/// -pub fn application_default_credentials( - path: Option<&str>, - client: &ClientOptions, - retry: &RetryConfig, -) -> crate::Result> { - let file = match ApplicationDefaultCredentialsFile::read(path)? { - Some(x) => x, - None => return Ok(None), - }; - - match file.type_.as_str() { - // - "authorized_user" => { - let token = AuthorizedUserCredentials { - client_id: file.client_id, - client_secret: file.client_secret, - refresh_token: file.refresh_token, - }; - - Ok(Some(Arc::new(TokenCredentialProvider::new( - token, - client.client()?, - retry.clone(), - )))) - } - type_ => Err(UnsupportedCredentialsType { - type_: type_.to_string(), - } - .into()), - } -} - /// A deserialized `application_default_credentials.json`-file. -/// +/// +/// # References +/// - +/// - #[derive(serde::Deserialize)] -struct ApplicationDefaultCredentialsFile { - #[serde(default)] - client_id: String, - #[serde(default)] - client_secret: String, - #[serde(default)] - refresh_token: String, - #[serde(rename = "type")] - type_: String, +#[serde(tag = "type")] +pub enum ApplicationDefaultCredentials { + /// Service Account. + /// + /// # References + /// - + #[serde(rename = "service_account")] + ServiceAccount(ServiceAccountCredentials), + /// Authorized user via "gcloud CLI Integration". 
+ /// + /// # References + /// - + #[serde(rename = "authorized_user")] + AuthorizedUser(AuthorizedUserCredentials), } -impl ApplicationDefaultCredentialsFile { +impl ApplicationDefaultCredentials { const CREDENTIALS_PATH: &'static str = ".config/gcloud/application_default_credentials.json"; // Create a new application default credential in the following situations: // 1. a file is passed in and the type matches. // 2. without argument if the well-known configuration file is present. - fn read(path: Option<&str>) -> Result, Error> { + pub fn read(path: Option<&str>) -> Result, Error> { if let Some(path) = path { return read_credentials_file::(path).map(Some); } @@ -478,8 +421,8 @@ impl ApplicationDefaultCredentialsFile { const DEFAULT_TOKEN_GCP_URI: &str = "https://accounts.google.com/o/oauth2/token"; /// -#[derive(Debug)] -struct AuthorizedUserCredentials { +#[derive(Debug, Deserialize)] +pub struct AuthorizedUserCredentials { client_id: String, client_secret: String, refresh_token: String, diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index f8a1631..a75527f 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -57,10 +57,7 @@ use crate::{ ObjectStore, Result, RetryConfig, }; -use credential::{ - application_default_credentials, default_gcs_base_url, InstanceCredentialProvider, - ServiceAccountCredentials, -}; +use credential::{InstanceCredentialProvider, ServiceAccountCredentials}; mod credential; @@ -68,6 +65,7 @@ const STORE: &str = "GCS"; /// [`CredentialProvider`] for [`GoogleCloudStorage`] pub type GcpCredentialProvider = Arc>; +use crate::gcp::credential::{ApplicationDefaultCredentials, DEFAULT_GCS_BASE_URL}; pub use credential::GcpCredential; #[derive(Debug, Snafu)] @@ -1034,10 +1032,8 @@ impl GoogleCloudStorageBuilder { }; // Then try to initialize from the application credentials file, or the environment. 
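// Illustrative sketch, not part of the patch: one way to construct the store.
// The builder logic below resolves credentials in order: explicitly supplied
// credentials, a service account key, the application default credentials
// file, and finally the instance metadata server. The bucket name and key
// path here are placeholders.
fn build_gcs_sketch() -> Result<GoogleCloudStorage> {
    GoogleCloudStorageBuilder::new()
        .with_bucket_name("example-bucket")
        .with_service_account_path("/path/to/service-account.json")
        .build()
}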
- let application_default_credentials = application_default_credentials( + let application_default_credentials = ApplicationDefaultCredentials::read( self.application_credentials_path.as_deref(), - &self.client_options, - &self.retry_config, )?; let disable_oauth = service_account_credentials @@ -1045,14 +1041,10 @@ impl GoogleCloudStorageBuilder { .map(|c| c.disable_oauth) .unwrap_or(false); - let gcs_base_url = service_account_credentials + let gcs_base_url: String = service_account_credentials .as_ref() - .map(|c| c.gcs_base_url.clone()) - .unwrap_or_else(default_gcs_base_url); - - // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes - let scope = "https://www.googleapis.com/auth/devstorage.full_control"; - let audience = "https://www.googleapis.com/oauth2/v4/token"; + .and_then(|c| c.gcs_base_url.clone()) + .unwrap_or_else(|| DEFAULT_GCS_BASE_URL.to_string()); let credentials = if let Some(credentials) = self.credentials { credentials @@ -1062,15 +1054,30 @@ impl GoogleCloudStorageBuilder { })) as _ } else if let Some(credentials) = service_account_credentials { Arc::new(TokenCredentialProvider::new( - credentials.oauth_provider(scope, audience)?, + credentials.token_provider()?, self.client_options.client()?, self.retry_config.clone(), )) as _ } else if let Some(credentials) = application_default_credentials { - credentials + match credentials { + ApplicationDefaultCredentials::AuthorizedUser(token) => { + Arc::new(TokenCredentialProvider::new( + token, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + ApplicationDefaultCredentials::ServiceAccount(token) => { + Arc::new(TokenCredentialProvider::new( + token.token_provider()?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + } } else { Arc::new(TokenCredentialProvider::new( - InstanceCredentialProvider::new(audience), + InstanceCredentialProvider::default(), self.client_options.metadata_client()?, self.retry_config.clone(), )) as _ @@ -1105,7 +1112,7 @@ mod test { use super::*; - const FAKE_KEY: &str = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; + const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; const NON_EXISTENT_NAME: &str = "nonexistentname"; #[tokio::test] @@ -1117,7 +1124,7 @@ mod test { list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; - if integration.client.base_url == default_gcs_base_url() { + if integration.client.base_url == DEFAULT_GCS_BASE_URL { // Fake GCS server doesn't currently honor ifGenerationMatch // https://github.com/fsouza/fake-gcs-server/issues/994 copy_if_not_exists(&integration).await; From 54270c70d230338f7bf7903b6a6a15f919a1bddc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 17 Oct 2023 12:39:34 +0100 Subject: [PATCH 205/397] Generate `ETag`s for `InMemory` and `LocalFileSystem` (#4879) (#4922) * Support ETag in InMemory (#4879) * Add LocalFileSystem Etag * Review feedback * Review feedback --- src/lib.rs | 206 ++++++++++++++++++++++++++++++++++++++------------ src/local.rs | 37 +++++---- src/memory.rs | 149 +++++++++++++++++++++--------------- 3 files changed, 268 insertions(+), 124 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ff0a465..b79042e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -698,12 +698,28 @@ pub struct GetOptions { /// 
Request will succeed if the `ObjectMeta::e_tag` matches /// otherwise returning [`Error::Precondition`] /// - /// + /// See + /// + /// Examples: + /// + /// ```text + /// If-Match: "xyzzy" + /// If-Match: "xyzzy", "r2d2xxxx", "c3piozzzz" + /// If-Match: * + /// ``` pub if_match: Option, /// Request will succeed if the `ObjectMeta::e_tag` does not match /// otherwise returning [`Error::NotModified`] /// - /// + /// See + /// + /// Examples: + /// + /// ```text + /// If-None-Match: "xyzzy" + /// If-None-Match: "xyzzy", "r2d2xxxx", "c3piozzzz" + /// If-None-Match: * + /// ``` pub if_none_match: Option, /// Request will succeed if the object has been modified since /// @@ -730,25 +746,41 @@ pub struct GetOptions { impl GetOptions { /// Returns an error if the modification conditions on this request are not satisfied - fn check_modified( - &self, - location: &Path, - last_modified: DateTime, - ) -> Result<()> { - if let Some(date) = self.if_modified_since { - if last_modified <= date { - return Err(Error::NotModified { - path: location.to_string(), - source: format!("{} >= {}", date, last_modified).into(), + /// + /// + fn check_preconditions(&self, meta: &ObjectMeta) -> Result<()> { + // The use of the invalid etag "*" means no ETag is equivalent to never matching + let etag = meta.e_tag.as_deref().unwrap_or("*"); + let last_modified = meta.last_modified; + + if let Some(m) = &self.if_match { + if m != "*" && m.split(',').map(str::trim).all(|x| x != etag) { + return Err(Error::Precondition { + path: meta.location.to_string(), + source: format!("{etag} does not match {m}").into(), }); } - } - - if let Some(date) = self.if_unmodified_since { + } else if let Some(date) = self.if_unmodified_since { if last_modified > date { return Err(Error::Precondition { - path: location.to_string(), - source: format!("{} < {}", date, last_modified).into(), + path: meta.location.to_string(), + source: format!("{date} < {last_modified}").into(), + }); + } + } + + if let Some(m) = &self.if_none_match { + if m == "*" || m.split(',').map(str::trim).any(|x| x == etag) { + return Err(Error::NotModified { + path: meta.location.to_string(), + source: format!("{etag} matches {m}").into(), + }); + } + } else if let Some(date) = self.if_modified_since { + if last_modified <= date { + return Err(Error::NotModified { + path: meta.location.to_string(), + source: format!("{date} >= {last_modified}").into(), }); } } @@ -952,6 +984,7 @@ mod test_util { mod tests { use super::*; use crate::test_util::flatten_list_stream; + use chrono::TimeZone; use rand::{thread_rng, Rng}; use tokio::io::AsyncWriteExt; @@ -1359,33 +1392,32 @@ mod tests { Err(e) => panic!("{e}"), } - if let Some(tag) = meta.e_tag { - let options = GetOptions { - if_match: Some(tag.clone()), - ..GetOptions::default() - }; - storage.get_opts(&path, options).await.unwrap(); - - let options = GetOptions { - if_match: Some("invalid".to_string()), - ..GetOptions::default() - }; - let err = storage.get_opts(&path, options).await.unwrap_err(); - assert!(matches!(err, Error::Precondition { .. }), "{err}"); - - let options = GetOptions { - if_none_match: Some(tag.clone()), - ..GetOptions::default() - }; - let err = storage.get_opts(&path, options).await.unwrap_err(); - assert!(matches!(err, Error::NotModified { .. 
}), "{err}"); - - let options = GetOptions { - if_none_match: Some("invalid".to_string()), - ..GetOptions::default() - }; - storage.get_opts(&path, options).await.unwrap(); - } + let tag = meta.e_tag.unwrap(); + let options = GetOptions { + if_match: Some(tag.clone()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let options = GetOptions { + if_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + let options = GetOptions { + if_none_match: Some(tag.clone()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::NotModified { .. }), "{err}"); + + let options = GetOptions { + if_none_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); } /// Returns a chunk of length `chunk_length` @@ -1697,8 +1729,86 @@ mod tests { assert!(stream.next().await.is_none()); } - // Tests TODO: - // GET nonexisting location (in_memory/file) - // DELETE nonexisting location - // PUT overwriting + #[test] + fn test_preconditions() { + let mut meta = ObjectMeta { + location: Path::from("test"), + last_modified: Utc.timestamp_nanos(100), + size: 100, + e_tag: Some("123".to_string()), + }; + + let mut options = GetOptions::default(); + options.check_preconditions(&meta).unwrap(); + + options.if_modified_since = Some(Utc.timestamp_nanos(50)); + options.check_preconditions(&meta).unwrap(); + + options.if_modified_since = Some(Utc.timestamp_nanos(100)); + options.check_preconditions(&meta).unwrap_err(); + + options.if_modified_since = Some(Utc.timestamp_nanos(101)); + options.check_preconditions(&meta).unwrap_err(); + + options = GetOptions::default(); + + options.if_unmodified_since = Some(Utc.timestamp_nanos(50)); + options.check_preconditions(&meta).unwrap_err(); + + options.if_unmodified_since = Some(Utc.timestamp_nanos(100)); + options.check_preconditions(&meta).unwrap(); + + options.if_unmodified_since = Some(Utc.timestamp_nanos(101)); + options.check_preconditions(&meta).unwrap(); + + options = GetOptions::default(); + + options.if_match = Some("123".to_string()); + options.check_preconditions(&meta).unwrap(); + + options.if_match = Some("123,354".to_string()); + options.check_preconditions(&meta).unwrap(); + + options.if_match = Some("354, 123,".to_string()); + options.check_preconditions(&meta).unwrap(); + + options.if_match = Some("354".to_string()); + options.check_preconditions(&meta).unwrap_err(); + + options.if_match = Some("*".to_string()); + options.check_preconditions(&meta).unwrap(); + + // If-Match takes precedence + options.if_unmodified_since = Some(Utc.timestamp_nanos(200)); + options.check_preconditions(&meta).unwrap(); + + options = GetOptions::default(); + + options.if_none_match = Some("123".to_string()); + options.check_preconditions(&meta).unwrap_err(); + + options.if_none_match = Some("*".to_string()); + options.check_preconditions(&meta).unwrap_err(); + + options.if_none_match = Some("1232".to_string()); + options.check_preconditions(&meta).unwrap(); + + options.if_none_match = Some("23, 123".to_string()); + options.check_preconditions(&meta).unwrap_err(); + + // If-None-Match takes precedence + options.if_modified_since = Some(Utc.timestamp_nanos(10)); + options.check_preconditions(&meta).unwrap_err(); + + // Check missing ETag + meta.e_tag = None; + options = 
GetOptions::default(); + + options.if_none_match = Some("*".to_string()); // Fails if any file exists + options.check_preconditions(&meta).unwrap_err(); + + options = GetOptions::default(); + options.if_match = Some("*".to_string()); // Passes if file exists + options.check_preconditions(&meta).unwrap(); + } } diff --git a/src/local.rs b/src/local.rs index 3ed63a4..3d4a02a 100644 --- a/src/local.rs +++ b/src/local.rs @@ -365,23 +365,12 @@ impl ObjectStore for LocalFileSystem { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - if options.if_match.is_some() || options.if_none_match.is_some() { - return Err(super::Error::NotSupported { - source: "ETags not supported by LocalFileSystem".to_string().into(), - }); - } - let location = location.clone(); let path = self.config.path_to_filesystem(&location)?; maybe_spawn_blocking(move || { let (file, metadata) = open_file(&path)?; - if options.if_unmodified_since.is_some() - || options.if_modified_since.is_some() - { - options.check_modified(&location, last_modified(&metadata))?; - } - let meta = convert_metadata(metadata, location)?; + options.check_preconditions(&meta)?; Ok(GetResult { payload: GetResultPayload::File(file, path), @@ -965,7 +954,7 @@ fn convert_entry(entry: DirEntry, location: Path) -> Result { convert_metadata(metadata, location) } -fn last_modified(metadata: &std::fs::Metadata) -> DateTime { +fn last_modified(metadata: &Metadata) -> DateTime { metadata .modified() .expect("Modified file time should be supported on this platform") @@ -977,15 +966,35 @@ fn convert_metadata(metadata: Metadata, location: Path) -> Result { let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { path: location.as_ref(), })?; + let inode = get_inode(&metadata); + let mtime = last_modified.timestamp_micros(); + + // Use an ETag scheme based on that used by many popular HTTP servers + // + // + let etag = format!("{inode:x}-{mtime:x}-{size:x}"); Ok(ObjectMeta { location, last_modified, size, - e_tag: None, + e_tag: Some(etag), }) } +#[cfg(unix)] +/// We include the inode when available to yield an ETag more resistant to collisions +/// and as used by popular web servers such as [Apache](https://httpd.apache.org/docs/2.2/mod/core.html#fileetag) +fn get_inode(metadata: &Metadata) -> u64 { + std::os::unix::fs::MetadataExt::ino(metadata) +} + +#[cfg(not(unix))] +/// On platforms where an inode isn't available, fallback to just relying on size and mtime +fn get_inode(metadata: &Metadata) -> u64 { + 0 +} + /// Convert walkdir results and converts not-found errors into `None`. /// Convert broken symlinks to `None`. fn convert_walkdir_result( diff --git a/src/memory.rs b/src/memory.rs index 0e22988..f638ed6 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -35,9 +35,6 @@ use std::sync::Arc; use std::task::Poll; use tokio::io::AsyncWrite; -type Entry = (Bytes, DateTime); -type StorageType = Arc>>; - /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -80,7 +77,41 @@ impl From for super::Error { /// storage provider. 
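// Illustrative sketch, not part of the patch: the preconditions added above
// enable a cache-style conditional read against any store, including the
// in-memory implementation below. The path is a placeholder; `Bytes`,
// `GetOptions`, `ObjectStore` and `Path` are the crate's own types.
async fn read_if_changed(
    store: &dyn ObjectStore,
    cached_etag: String,
) -> crate::Result<Option<Bytes>> {
    let options = GetOptions {
        if_none_match: Some(cached_etag),
        ..GetOptions::default()
    };
    match store.get_opts(&Path::from("data/file1"), options).await {
        // The object changed: return the new contents.
        Ok(r) => Ok(Some(r.bytes().await?)),
        // The ETag still matches: the cached copy is current.
        Err(crate::Error::NotModified { .. }) => Ok(None),
        Err(e) => Err(e),
    }
}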
#[derive(Debug, Default)] pub struct InMemory { - storage: StorageType, + storage: SharedStorage, +} + +#[derive(Debug, Clone)] +struct Entry { + data: Bytes, + last_modified: DateTime, + e_tag: usize, +} + +impl Entry { + fn new(data: Bytes, last_modified: DateTime, e_tag: usize) -> Self { + Self { + data, + last_modified, + e_tag, + } + } +} + +#[derive(Debug, Default, Clone)] +struct Storage { + next_etag: usize, + map: BTreeMap, +} + +type SharedStorage = Arc>; + +impl Storage { + fn insert(&mut self, location: &Path, bytes: Bytes) { + let etag = self.next_etag; + self.next_etag += 1; + let entry = Entry::new(bytes, Utc::now(), etag); + self.map.insert(location.clone(), entry); + } } impl std::fmt::Display for InMemory { @@ -92,9 +123,7 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.storage - .write() - .insert(location.clone(), (bytes, Utc::now())); + self.storage.write().insert(location, bytes); Ok(()) } @@ -128,33 +157,30 @@ impl ObjectStore for InMemory { Ok(Box::new(InMemoryAppend { location: location.clone(), data: Vec::::new(), - storage: StorageType::clone(&self.storage), + storage: SharedStorage::clone(&self.storage), })) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - if options.if_match.is_some() || options.if_none_match.is_some() { - return Err(super::Error::NotSupported { - source: "ETags not supported by InMemory".to_string().into(), - }); - } - let (data, last_modified) = self.entry(location).await?; - options.check_modified(location, last_modified)?; + let entry = self.entry(location).await?; + let e_tag = entry.e_tag.to_string(); + let meta = ObjectMeta { location: location.clone(), - last_modified, - size: data.len(), - e_tag: None, + last_modified: entry.last_modified, + size: entry.data.len(), + e_tag: Some(e_tag), }; + options.check_preconditions(&meta)?; let (range, data) = match options.range { Some(range) => { - let len = data.len(); + let len = entry.data.len(); ensure!(range.end <= len, OutOfRangeSnafu { range, len }); ensure!(range.start <= range.end, BadRangeSnafu { range }); - (range.clone(), data.slice(range)) + (range.clone(), entry.data.slice(range)) } - None => (0..data.len(), data), + None => (0..entry.data.len(), entry.data), }; let stream = futures::stream::once(futures::future::ready(Ok(data))); @@ -170,15 +196,18 @@ impl ObjectStore for InMemory { location: &Path, ranges: &[Range], ) -> Result> { - let data = self.entry(location).await?; + let entry = self.entry(location).await?; ranges .iter() .map(|range| { let range = range.clone(); - let len = data.0.len(); - ensure!(range.end <= data.0.len(), OutOfRangeSnafu { range, len }); + let len = entry.data.len(); + ensure!( + range.end <= entry.data.len(), + OutOfRangeSnafu { range, len } + ); ensure!(range.start <= range.end, BadRangeSnafu { range }); - Ok(data.0.slice(range)) + Ok(entry.data.slice(range)) }) .collect() } @@ -188,14 +217,14 @@ impl ObjectStore for InMemory { Ok(ObjectMeta { location: location.clone(), - last_modified: entry.1, - size: entry.0.len(), - e_tag: None, + last_modified: entry.last_modified, + size: entry.data.len(), + e_tag: Some(entry.e_tag.to_string()), }) } async fn delete(&self, location: &Path) -> Result<()> { - self.storage.write().remove(location); + self.storage.write().map.remove(location); Ok(()) } @@ -208,6 +237,7 @@ impl ObjectStore for InMemory { let storage = self.storage.read(); let values: Vec<_> = storage + .map 
.range((prefix)..) .take_while(|(key, _)| key.as_ref().starts_with(prefix.as_ref())) .filter(|(key, _)| { @@ -219,9 +249,9 @@ impl ObjectStore for InMemory { .map(|(key, value)| { Ok(ObjectMeta { location: key.clone(), - last_modified: value.1, - size: value.0.len(), - e_tag: None, + last_modified: value.last_modified, + size: value.data.len(), + e_tag: Some(value.e_tag.to_string()), }) }) .collect(); @@ -241,7 +271,7 @@ impl ObjectStore for InMemory { // Only objects in this base level should be returned in the // response. Otherwise, we just collect the common prefixes. let mut objects = vec![]; - for (k, v) in self.storage.read().range((prefix)..) { + for (k, v) in self.storage.read().map.range((prefix)..) { if !k.as_ref().starts_with(prefix.as_ref()) { break; } @@ -263,9 +293,9 @@ impl ObjectStore for InMemory { } else { let object = ObjectMeta { location: k.clone(), - last_modified: v.1, - size: v.0.len(), - e_tag: None, + last_modified: v.last_modified, + size: v.data.len(), + e_tag: Some(v.e_tag.to_string()), }; objects.push(object); } @@ -278,23 +308,21 @@ impl ObjectStore for InMemory { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let data = self.entry(from).await?; - self.storage - .write() - .insert(to.clone(), (data.0, Utc::now())); + let entry = self.entry(from).await?; + self.storage.write().insert(to, entry.data); Ok(()) } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - let data = self.entry(from).await?; + let entry = self.entry(from).await?; let mut storage = self.storage.write(); - if storage.contains_key(to) { + if storage.map.contains_key(to) { return Err(Error::AlreadyExists { path: to.to_string(), } .into()); } - storage.insert(to.clone(), (data.0, Utc::now())); + storage.insert(to, entry.data); Ok(()) } } @@ -319,9 +347,10 @@ impl InMemory { self.fork() } - async fn entry(&self, location: &Path) -> Result<(Bytes, DateTime)> { + async fn entry(&self, location: &Path) -> Result { let storage = self.storage.read(); let value = storage + .map .get(location) .cloned() .context(NoDataInMemorySnafu { @@ -335,7 +364,7 @@ impl InMemory { struct InMemoryUpload { location: Path, data: Vec, - storage: StorageType, + storage: Arc>, } impl AsyncWrite for InMemoryUpload { @@ -343,7 +372,7 @@ impl AsyncWrite for InMemoryUpload { mut self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> std::task::Poll> { + ) -> Poll> { self.data.extend_from_slice(buf); Poll::Ready(Ok(buf.len())) } @@ -351,18 +380,16 @@ impl AsyncWrite for InMemoryUpload { fn poll_flush( self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { Poll::Ready(Ok(())) } fn poll_shutdown( mut self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { let data = Bytes::from(std::mem::take(&mut self.data)); - self.storage - .write() - .insert(self.location.clone(), (data, Utc::now())); + self.storage.write().insert(&self.location, data); Poll::Ready(Ok(())) } } @@ -370,7 +397,7 @@ impl AsyncWrite for InMemoryUpload { struct InMemoryAppend { location: Path, data: Vec, - storage: StorageType, + storage: Arc>, } impl AsyncWrite for InMemoryAppend { @@ -378,7 +405,7 @@ impl AsyncWrite for InMemoryAppend { mut self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> std::task::Poll> { + ) -> Poll> { self.data.extend_from_slice(buf); Poll::Ready(Ok(buf.len())) } @@ -386,20 +413,18 @@ impl AsyncWrite for InMemoryAppend { fn poll_flush( mut self: Pin<&mut Self>, _cx: 
&mut std::task::Context<'_>, - ) -> std::task::Poll> { - let storage = StorageType::clone(&self.storage); + ) -> Poll> { + let storage = Arc::clone(&self.storage); let mut writer = storage.write(); - if let Some((bytes, _)) = writer.remove(&self.location) { + if let Some(entry) = writer.map.remove(&self.location) { let buf = std::mem::take(&mut self.data); - let concat = Bytes::from_iter(bytes.into_iter().chain(buf)); - writer.insert(self.location.clone(), (concat, Utc::now())); + let concat = Bytes::from_iter(entry.data.into_iter().chain(buf)); + writer.insert(&self.location, concat); } else { - writer.insert( - self.location.clone(), - (Bytes::from(std::mem::take(&mut self.data)), Utc::now()), - ); + let data = Bytes::from(std::mem::take(&mut self.data)); + writer.insert(&self.location, data); }; Poll::Ready(Ok(())) } From 4e11875606e53f1361be4afa5e54327bd2852ab6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 17 Oct 2023 22:10:31 +0100 Subject: [PATCH 206/397] Remove Nested async and Fallibility from ObjectStore::list (#4930) * Remove nested async and fallibility from ObjectStore::list * Clippy * Update limit test * Update docs --- src/aws/mod.rs | 13 ++- src/azure/mod.rs | 7 +- src/chunked.rs | 13 ++- src/client/list.rs | 32 +++----- src/gcp/mod.rs | 7 +- src/http/mod.rs | 24 +++--- src/lib.rs | 178 ++++++++++++++++------------------------ src/limit.rs | 44 ++++++---- src/local.rs | 82 ++++++++---------- src/memory.rs | 7 +- src/prefix.rs | 17 ++-- src/throttle.rs | 47 +++++------ tests/get_range_file.rs | 5 +- 13 files changed, 197 insertions(+), 279 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 3ddce08..d3c5086 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -331,19 +331,16 @@ impl ObjectStore for AmazonS3 { .boxed() } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.client.list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.client.list(prefix) } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - self.client.list_with_offset(prefix, offset).await + ) -> BoxStream<'_, Result> { + self.client.list_with_offset(prefix, offset) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 190b73b..2a08c67 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -206,11 +206,8 @@ impl ObjectStore for MicrosoftAzure { self.client.delete_request(location, &()).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.client.list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.client.list(prefix) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/src/chunked.rs b/src/chunked.rs index 008dec6..d3e02b4 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -147,19 +147,16 @@ impl ObjectStore for ChunkedStore { self.inner.delete(location).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.inner.list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.inner.list(prefix) } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - self.inner.list_with_offset(prefix, offset).await + ) -> BoxStream<'_, Result> { + self.inner.list_with_offset(prefix, offset) } async fn list_with_delimiter(&self, prefix: 
Option<&Path>) -> Result { diff --git a/src/client/list.rs b/src/client/list.rs index b2dbee2..371894d 100644 --- a/src/client/list.rs +++ b/src/client/list.rs @@ -46,16 +46,13 @@ pub trait ListClientExt { offset: Option<&Path>, ) -> BoxStream<'_, Result>; - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>>; + ) -> BoxStream<'_, Result>; async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; } @@ -90,31 +87,22 @@ impl ListClientExt for T { .boxed() } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - let stream = self - .list_paginated(prefix, false, None) + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.list_paginated(prefix, false, None) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() - .boxed(); - - Ok(stream) + .boxed() } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - let stream = self - .list_paginated(prefix, false, Some(offset)) + ) -> BoxStream<'_, Result> { + self.list_paginated(prefix, false, Some(offset)) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() - .boxed(); - - Ok(stream) + .boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index a75527f..513e396 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -601,11 +601,8 @@ impl ObjectStore for GoogleCloudStorage { self.client.delete_request(location).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.client.list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.client.list(prefix) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/src/http/mod.rs b/src/http/mod.rs index 6ffb623..2fd7850 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -34,7 +34,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; -use futures::StreamExt; +use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; @@ -122,14 +122,13 @@ impl ObjectStore for HttpStore { self.client.delete(location).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or_default(); - let status = self.client.list(prefix, "infinity").await?; - Ok(futures::stream::iter( - status + let prefix = prefix.cloned(); + futures::stream::once(async move { + let status = self.client.list(prefix.as_ref(), "infinity").await?; + + let iter = status .response .into_iter() .filter(|r| !r.is_dir()) @@ -138,9 +137,12 @@ impl ObjectStore for HttpStore { response.object_meta(self.client.base_url()) }) // Filter out exact prefix matches - .filter_ok(move |r| r.location.as_ref().len() > prefix_len), - ) - .boxed()) + .filter_ok(move |r| r.location.as_ref().len() > prefix_len); + + Ok::<_, crate::Error>(futures::stream::iter(iter)) + }) + .try_flatten() + .boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/src/lib.rs b/src/lib.rs index b79042e..9b39644 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -95,18 +95,18 @@ //! //! ``` //! 
# use object_store::local::LocalFileSystem; +//! # use std::sync::Arc; +//! # use object_store::{path::Path, ObjectStore}; +//! # use futures::stream::StreamExt; //! # // use LocalFileSystem for example -//! # fn get_object_store() -> LocalFileSystem { -//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) //! # } -//! +//! # //! # async fn example() { -//! use std::sync::Arc; -//! use object_store::{path::Path, ObjectStore}; -//! use futures::stream::StreamExt; -//! +//! # //! // create an ObjectStore -//! let object_store: Arc = Arc::new(get_object_store()); +//! let object_store: Arc = get_object_store(); //! //! // Recursively list all files below the 'data' path. //! // 1. On AWS S3 this would be the 'data/' prefix @@ -114,21 +114,12 @@ //! let prefix: Path = "data".try_into().unwrap(); //! //! // Get an `async` stream of Metadata objects: -//! let list_stream = object_store -//! .list(Some(&prefix)) -//! .await -//! .expect("Error listing files"); +//! let mut list_stream = object_store.list(Some(&prefix)); //! -//! // Print a line about each object based on its metadata -//! // using for_each from `StreamExt` trait. -//! list_stream -//! .for_each(move |meta| { -//! async { -//! let meta = meta.expect("Error listing"); -//! println!("Name: {}, size: {}", meta.location, meta.size); -//! } -//! }) -//! .await; +//! // Print a line about each object +//! while let Some(meta) = list_stream.next().await.transpose().unwrap() { +//! println!("Name: {}, size: {}", meta.location, meta.size); +//! } //! # } //! ``` //! @@ -147,19 +138,18 @@ //! from remote storage or files in the local filesystem as a stream. //! //! ``` +//! # use futures::TryStreamExt; //! # use object_store::local::LocalFileSystem; -//! # // use LocalFileSystem for example -//! # fn get_object_store() -> LocalFileSystem { -//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # use std::sync::Arc; +//! # use object_store::{path::Path, ObjectStore}; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) //! # } -//! +//! # //! # async fn example() { -//! use std::sync::Arc; -//! use object_store::{path::Path, ObjectStore}; -//! use futures::stream::StreamExt; -//! +//! # //! // create an ObjectStore -//! let object_store: Arc = Arc::new(get_object_store()); +//! let object_store: Arc = get_object_store(); //! //! // Retrieve a specific file //! let path: Path = "data/file01.parquet".try_into().unwrap(); @@ -171,16 +161,11 @@ //! .unwrap() //! .into_stream(); //! -//! // Count the '0's using `map` from `StreamExt` trait +//! // Count the '0's using `try_fold` from `TryStreamExt` trait //! let num_zeros = stream -//! .map(|bytes| { -//! let bytes = bytes.unwrap(); -//! bytes.iter().filter(|b| **b == 0).count() -//! }) -//! .collect::>() -//! .await -//! .into_iter() -//! .sum::(); +//! .try_fold(0, |acc, bytes| async move { +//! Ok(acc + bytes.iter().filter(|b| **b == 0).count()) +//! }).await.unwrap(); //! //! println!("Num zeros in {} is {}", path, num_zeros); //! # } @@ -196,22 +181,19 @@ //! //! ``` //! # use object_store::local::LocalFileSystem; -//! # fn get_object_store() -> LocalFileSystem { -//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # use object_store::ObjectStore; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) //! # } //! # async fn put() { -//! 
use object_store::ObjectStore; -//! use std::sync::Arc; -//! use bytes::Bytes; -//! use object_store::path::Path; -//! -//! let object_store: Arc = Arc::new(get_object_store()); +//! # +//! let object_store: Arc = get_object_store(); //! let path: Path = "data/file1".try_into().unwrap(); //! let bytes = Bytes::from_static(b"hello"); -//! object_store -//! .put(&path, bytes) -//! .await -//! .unwrap(); +//! object_store.put(&path, bytes).await.unwrap(); //! # } //! ``` //! @@ -220,22 +202,20 @@ //! //! ``` //! # use object_store::local::LocalFileSystem; -//! # fn get_object_store() -> LocalFileSystem { -//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # use object_store::ObjectStore; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) //! # } //! # async fn multi_upload() { -//! use object_store::ObjectStore; -//! use std::sync::Arc; -//! use bytes::Bytes; -//! use tokio::io::AsyncWriteExt; -//! use object_store::path::Path; -//! -//! let object_store: Arc = Arc::new(get_object_store()); +//! # +//! let object_store: Arc = get_object_store(); //! let path: Path = "data/large_file".try_into().unwrap(); -//! let (_id, mut writer) = object_store -//! .put_multipart(&path) -//! .await -//! .unwrap(); +//! let (_id, mut writer) = object_store.put_multipart(&path).await.unwrap(); +//! //! let bytes = Bytes::from_static(b"hello"); //! writer.write_all(&bytes).await.unwrap(); //! writer.flush().await.unwrap(); @@ -439,23 +419,22 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// return Ok. If it is an error, it will be [`Error::NotFound`]. /// /// ``` + /// # use futures::{StreamExt, TryStreamExt}; /// # use object_store::local::LocalFileSystem; /// # async fn example() -> Result<(), Box> { /// # let root = tempfile::TempDir::new().unwrap(); /// # let store = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - /// use object_store::{ObjectStore, ObjectMeta}; - /// use object_store::path::Path; - /// use futures::{StreamExt, TryStreamExt}; - /// use bytes::Bytes; - /// + /// # use object_store::{ObjectStore, ObjectMeta}; + /// # use object_store::path::Path; + /// # use futures::{StreamExt, TryStreamExt}; + /// # use bytes::Bytes; + /// # /// // Create two objects /// store.put(&Path::from("foo"), Bytes::from("foo")).await?; /// store.put(&Path::from("bar"), Bytes::from("bar")).await?; /// /// // List object - /// let locations = store.list(None).await? - /// .map(|meta: Result| meta.map(|m| m.location)) - /// .boxed(); + /// let locations = store.list(None).map_ok(|m| m.location).boxed(); /// /// // Delete them /// store.delete_stream(locations).try_collect::>().await?; @@ -484,10 +463,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// `foo/bar_baz/x`. 
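// Illustrative sketch, not part of the patch: after this change `list` returns
// the stream directly, so callers drop the outer `.await?`. Assumes
// `futures::TryStreamExt` is in scope; the prefix is a placeholder.
async fn count_objects(store: &dyn ObjectStore) -> Result<usize> {
    let prefix = Path::from("data");
    // Before this commit: let stream = store.list(Some(&prefix)).await?;
    let stream = store.list(Some(&prefix));
    Ok(stream.try_collect::<Vec<_>>().await?.len())
}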
/// /// Note: the order of returned [`ObjectMeta`] is not guaranteed - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; /// List all the objects with the given prefix and a location greater than `offset` /// @@ -495,18 +471,15 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// the number of network requests required /// /// Note: the order of returned [`ObjectMeta`] is not guaranteed - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { + ) -> BoxStream<'_, Result> { let offset = offset.clone(); - let stream = self - .list(prefix) - .await? + self.list(prefix) .try_filter(move |f| futures::future::ready(f.location > offset)) - .boxed(); - Ok(stream) + .boxed() } /// List objects with the given prefix and an implementation specific @@ -624,19 +597,16 @@ macro_rules! as_ref_impl { self.as_ref().delete_stream(locations) } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.as_ref().list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.as_ref().list(prefix) } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - self.as_ref().list_with_offset(prefix, offset).await + ) -> BoxStream<'_, Result> { + self.as_ref().list_with_offset(prefix, offset) } async fn list_with_delimiter( @@ -973,7 +943,6 @@ mod test_util { ) -> Result> { storage .list(prefix) - .await? .map_ok(|meta| meta.location) .try_collect::>() .await @@ -1264,11 +1233,7 @@ mod tests { ]; for (prefix, offset) in cases { - let s = storage - .list_with_offset(prefix.as_ref(), &offset) - .await - .unwrap(); - + let s = storage.list_with_offset(prefix.as_ref(), &offset); let mut actual: Vec<_> = s.map_ok(|x| x.location).try_collect().await.unwrap(); @@ -1700,12 +1665,7 @@ mod tests { } async fn delete_fixtures(storage: &DynObjectStore) { - let paths = storage - .list(None) - .await - .unwrap() - .map_ok(|meta| meta.location) - .boxed(); + let paths = storage.list(None).map_ok(|meta| meta.location).boxed(); storage .delete_stream(paths) .try_collect::>() @@ -1714,18 +1674,18 @@ mod tests { } /// Test that the returned stream does not borrow the lifetime of Path - async fn list_store<'a, 'b>( + fn list_store<'a>( store: &'a dyn ObjectStore, - path_str: &'b str, - ) -> super::Result>> { + path_str: &str, + ) -> BoxStream<'a, Result> { let path = Path::from(path_str); - store.list(Some(&path)).await + store.list(Some(&path)) } #[tokio::test] async fn test_list_lifetimes() { let store = memory::InMemory::new(); - let mut stream = list_store(&store, "path").await.unwrap(); + let mut stream = list_store(&store, "path"); assert!(stream.next().await.is_none()); } diff --git a/src/limit.rs b/src/limit.rs index a9b8c4b..00cbce0 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -23,7 +23,7 @@ use crate::{ }; use async_trait::async_trait; use bytes::Bytes; -use futures::Stream; +use futures::{FutureExt, Stream}; use std::io::{Error, IoSlice}; use std::ops::Range; use std::pin::Pin; @@ -147,23 +147,31 @@ impl ObjectStore for LimitStore { self.inner.delete_stream(locations) } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - let s = self.inner.list(prefix).await?; - Ok(PermitWrapper::new(s, permit).boxed()) + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, 
Result> { + let prefix = prefix.cloned(); + let fut = Arc::clone(&self.semaphore) + .acquire_owned() + .map(move |permit| { + let s = self.inner.list(prefix.as_ref()); + PermitWrapper::new(s, permit.unwrap()) + }); + fut.into_stream().flatten().boxed() } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - let s = self.inner.list_with_offset(prefix, offset).await?; - Ok(PermitWrapper::new(s, permit).boxed()) + ) -> BoxStream<'_, Result> { + let prefix = prefix.cloned(); + let offset = offset.clone(); + let fut = Arc::clone(&self.semaphore) + .acquire_owned() + .map(move |permit| { + let s = self.inner.list_with_offset(prefix.as_ref(), &offset); + PermitWrapper::new(s, permit.unwrap()) + }); + fut.into_stream().flatten().boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -272,6 +280,8 @@ mod tests { use crate::memory::InMemory; use crate::tests::*; use crate::ObjectStore; + use futures::stream::StreamExt; + use std::pin::Pin; use std::time::Duration; use tokio::time::timeout; @@ -290,19 +300,21 @@ mod tests { let mut streams = Vec::with_capacity(max_requests); for _ in 0..max_requests { - let stream = integration.list(None).await.unwrap(); + let mut stream = integration.list(None).peekable(); + Pin::new(&mut stream).peek().await; // Ensure semaphore is acquired streams.push(stream); } let t = Duration::from_millis(20); // Expect to not be able to make another request - assert!(timeout(t, integration.list(None)).await.is_err()); + let fut = integration.list(None).collect::>(); + assert!(timeout(t, fut).await.is_err()); // Drop one of the streams streams.pop(); // Can now make another request - integration.list(None).await.unwrap(); + integration.list(None).collect::>().await; } } diff --git a/src/local.rs b/src/local.rs index 3d4a02a..38467c3 100644 --- a/src/local.rs +++ b/src/local.rs @@ -420,14 +420,14 @@ impl ObjectStore for LocalFileSystem { .await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let config = Arc::clone(&self.config); let root_path = match prefix { - Some(prefix) => config.path_to_filesystem(prefix)?, + Some(prefix) => match config.path_to_filesystem(prefix) { + Ok(path) => path, + Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(), + }, None => self.config.root.to_file_path().unwrap(), }; @@ -457,36 +457,34 @@ impl ObjectStore for LocalFileSystem { // If no tokio context, return iterator directly as no // need to perform chunked spawn_blocking reads if tokio::runtime::Handle::try_current().is_err() { - return Ok(futures::stream::iter(s).boxed()); + return futures::stream::iter(s).boxed(); } // Otherwise list in batches of CHUNK_SIZE const CHUNK_SIZE: usize = 1024; let buffer = VecDeque::with_capacity(CHUNK_SIZE); - let stream = - futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { - if buffer.is_empty() { - (s, buffer) = tokio::task::spawn_blocking(move || { - for _ in 0..CHUNK_SIZE { - match s.next() { - Some(r) => buffer.push_back(r), - None => break, - } + futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { + if buffer.is_empty() { + (s, buffer) = tokio::task::spawn_blocking(move || { + for _ in 0..CHUNK_SIZE { + match s.next() { + Some(r) => buffer.push_back(r), + None => break, } - (s, buffer) - }) - .await?; - } - - match buffer.pop_front() 
{ - Some(Err(e)) => Err(e), - Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), - None => Ok(None), - } - }); + } + (s, buffer) + }) + .await?; + } - Ok(stream.boxed()) + match buffer.pop_front() { + Some(Err(e)) => Err(e), + Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), + None => Ok(None), + } + }) + .boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -1138,21 +1136,14 @@ mod tests { let store = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - // `list` must fail - match store.list(None).await { - Err(_) => { - // ok, error found - } - Ok(mut stream) => { - let mut any_err = false; - while let Some(res) = stream.next().await { - if res.is_err() { - any_err = true; - } - } - assert!(any_err); + let mut stream = store.list(None); + let mut any_err = false; + while let Some(res) = stream.next().await { + if res.is_err() { + any_err = true; } } + assert!(any_err); // `list_with_delimiter assert!(store.list_with_delimiter(None).await.is_err()); @@ -1226,13 +1217,7 @@ mod tests { prefix: Option<&Path>, expected: &[&str], ) { - let result: Vec<_> = integration - .list(prefix) - .await - .unwrap() - .try_collect() - .await - .unwrap(); + let result: Vec<_> = integration.list(prefix).try_collect().await.unwrap(); let mut strings: Vec<_> = result.iter().map(|x| x.location.as_ref()).collect(); strings.sort_unstable(); @@ -1428,8 +1413,7 @@ mod tests { std::fs::write(temp_dir.path().join(filename), "foo").unwrap(); - let list_stream = integration.list(None).await.unwrap(); - let res: Vec<_> = list_stream.try_collect().await.unwrap(); + let res: Vec<_> = integration.list(None).try_collect().await.unwrap(); assert_eq!(res.len(), 1); assert_eq!(res[0].location.as_ref(), filename); diff --git a/src/memory.rs b/src/memory.rs index f638ed6..00b330b 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -228,10 +228,7 @@ impl ObjectStore for InMemory { Ok(()) } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let root = Path::default(); let prefix = prefix.unwrap_or(&root); @@ -256,7 +253,7 @@ impl ObjectStore for InMemory { }) .collect(); - Ok(futures::stream::iter(values).boxed()) + futures::stream::iter(values).boxed() } /// The memory implementation returns all results, as opposed to the cloud diff --git a/src/prefix.rs b/src/prefix.rs index 39585f7..3776dec 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -144,24 +144,21 @@ impl ObjectStore for PrefixStore { self.inner.delete(&full_path).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let prefix = self.full_path(prefix.unwrap_or(&Path::default())); - let s = self.inner.list(Some(&prefix)).await?; - Ok(s.map_ok(|meta| self.strip_meta(meta)).boxed()) + let s = self.inner.list(Some(&prefix)); + s.map_ok(|meta| self.strip_meta(meta)).boxed() } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { + ) -> BoxStream<'_, Result> { let offset = self.full_path(offset); let prefix = self.full_path(prefix.unwrap_or(&Path::default())); - let s = self.inner.list_with_offset(Some(&prefix), &offset).await?; - Ok(s.map_ok(|meta| self.strip_meta(meta)).boxed()) + let s = self.inner.list_with_offset(Some(&prefix), &offset); + s.map_ok(|meta| self.strip_meta(meta)).boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/src/throttle.rs b/src/throttle.rs 
index 58c476a..f716a11 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -233,29 +233,30 @@ impl ObjectStore for ThrottledStore { self.inner.delete(location).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - sleep(self.config().wait_list_per_call).await; - - // need to copy to avoid moving / referencing `self` - let wait_list_per_entry = self.config().wait_list_per_entry; - let stream = self.inner.list(prefix).await?; - Ok(throttle_stream(stream, move |_| wait_list_per_entry)) + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + let stream = self.inner.list(prefix); + futures::stream::once(async move { + let wait_list_per_entry = self.config().wait_list_per_entry; + sleep(self.config().wait_list_per_call).await; + throttle_stream(stream, move |_| wait_list_per_entry) + }) + .flatten() + .boxed() } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - sleep(self.config().wait_list_per_call).await; - - // need to copy to avoid moving / referencing `self` - let wait_list_per_entry = self.config().wait_list_per_entry; - let stream = self.inner.list_with_offset(prefix, offset).await?; - Ok(throttle_stream(stream, move |_| wait_list_per_entry)) + ) -> BoxStream<'_, Result> { + let stream = self.inner.list_with_offset(prefix, offset); + futures::stream::once(async move { + let wait_list_per_entry = self.config().wait_list_per_entry; + sleep(self.config().wait_list_per_call).await; + throttle_stream(stream, move |_| wait_list_per_entry) + }) + .flatten() + .boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -511,13 +512,7 @@ mod tests { let prefix = Path::from("foo"); // clean up store - let entries: Vec<_> = store - .list(Some(&prefix)) - .await - .unwrap() - .try_collect() - .await - .unwrap(); + let entries: Vec<_> = store.list(Some(&prefix)).try_collect().await.unwrap(); for entry in entries { store.delete(&entry.location).await.unwrap(); @@ -583,8 +578,6 @@ mod tests { let t0 = Instant::now(); store .list(Some(&prefix)) - .await - .unwrap() .try_collect::>() .await .unwrap(); diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index f926e3b..25c4692 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -75,10 +75,7 @@ impl ObjectStore for MyStore { todo!() } - async fn list( - &self, - _: Option<&Path>, - ) -> object_store::Result>> { + fn list(&self, _: Option<&Path>) -> BoxStream<'_, object_store::Result> { todo!() } From 649ac0aa02a2fc2a465fa345161f978a78df2052 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 17 Oct 2023 22:27:16 +0100 Subject: [PATCH 207/397] Fix object_store docs (#4947) --- src/parse.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index 1159e9a..2e72a71 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -47,12 +47,12 @@ impl From for super::Error { } } -/// Recognises various URL formats, identifying the relevant [`ObjectStore`](crate::ObjectStore) +/// Recognises various URL formats, identifying the relevant [`ObjectStore`] #[derive(Debug, Eq, PartialEq)] enum ObjectStoreScheme { - /// Url corresponding to [`LocalFileSystem`](crate::local::LocalFileSystem) + /// Url corresponding to [`LocalFileSystem`] Local, - /// Url corresponding to [`InMemory`](crate::memory::InMemory) + /// Url corresponding to [`InMemory`] Memory, /// Url corresponding to [`AmazonS3`](crate::aws::AmazonS3) AmazonS3, 
From e0c9873e57ebe97d40f707b8101d818a2b0b33ee Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 09:44:46 +0100 Subject: [PATCH 208/397] Return `PutResult` with an ETag from ObjectStore::put (#4934) (#4944) * Return ETag from ObjectStore::put (#4934) * Further tests * Clippy * Review feedback --- src/aws/client.rs | 12 +++++- src/aws/mod.rs | 25 +++--------- src/azure/mod.rs | 20 +++++----- src/chunked.rs | 3 +- src/client/header.rs | 17 ++++---- src/gcp/mod.rs | 87 +++++++++++++++++------------------------ src/http/client.rs | 4 +- src/http/mod.rs | 13 ++++-- src/lib.rs | 35 ++++++++++++++++- src/limit.rs | 4 +- src/local.rs | 43 ++++++++++++++------ src/memory.rs | 14 ++++--- src/prefix.rs | 5 ++- src/throttle.rs | 5 ++- tests/get_range_file.rs | 4 +- 15 files changed, 169 insertions(+), 122 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 8a45a9f..eb81e92 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -21,6 +21,7 @@ use crate::aws::{ AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, }; use crate::client::get::GetClient; +use crate::client::header::get_etag; use crate::client::list::ListClient; use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; @@ -122,6 +123,11 @@ pub(crate) enum Error { #[snafu(display("Got invalid multipart response: {}", source))] InvalidMultipartResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, } impl From for crate::Error { @@ -243,12 +249,14 @@ impl S3Client { } /// Make an S3 PUT request + /// + /// Returns the ETag pub async fn put_request( &self, path: &Path, bytes: Bytes, query: &T, - ) -> Result { + ) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); @@ -287,7 +295,7 @@ impl S3Client { path: path.as_ref(), })?; - Ok(response) + Ok(get_etag(response.headers()).context(MetadataSnafu)?) } /// Make an S3 Delete request diff --git a/src/aws/mod.rs b/src/aws/mod.rs index d3c5086..6d5aece 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -59,7 +59,7 @@ use crate::multipart::{PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, Result, RetryConfig, + ObjectStore, Path, PutResult, Result, RetryConfig, }; mod checksum; @@ -109,12 +109,6 @@ enum Error { #[snafu(display("Missing SecretAccessKey"))] MissingSecretAccessKey, - #[snafu(display("ETag Header missing from response"))] - MissingEtag, - - #[snafu(display("Received header containing non-ASCII data"))] - BadHeader { source: reqwest::header::ToStrError }, - #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] UnableToParseUrl { source: url::ParseError, @@ -273,9 +267,9 @@ impl Signer for AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client.put_request(location, bytes, &()).await?; - Ok(()) + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let e_tag = self.client.put_request(location, bytes, &()).await?; + Ok(PutResult { e_tag: Some(e_tag) }) } async fn put_multipart( @@ -365,10 +359,9 @@ struct S3MultiPartUpload { #[async_trait] impl PutPart for S3MultiPartUpload { async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - use reqwest::header::ETAG; let part = (part_idx + 1).to_string(); - let response = self + let content_id = self .client .put_request( &self.location, @@ -377,13 +370,7 @@ impl PutPart for S3MultiPartUpload { ) .await?; - let etag = response.headers().get(ETAG).context(MissingEtagSnafu)?; - - let etag = etag.to_str().context(BadHeaderSnafu)?; - - Ok(PartId { - content_id: etag.to_string(), - }) + Ok(PartId { content_id }) } async fn complete(&self, completed_parts: Vec) -> Result<()> { diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 2a08c67..0e638ef 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -31,7 +31,7 @@ use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Result, RetryConfig, + ObjectStore, PutResult, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -62,6 +62,7 @@ mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] pub type AzureCredentialProvider = Arc>; +use crate::client::header::get_etag; pub use credential::AzureCredential; const STORE: &str = "MicrosoftAzure"; @@ -81,9 +82,6 @@ const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; #[derive(Debug, Snafu)] #[allow(missing_docs)] enum Error { - #[snafu(display("Received header containing non-ASCII data"))] - BadHeader { source: reqwest::header::ToStrError }, - #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] UnableToParseUrl { source: url::ParseError, @@ -126,8 +124,10 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - #[snafu(display("ETag Header missing from response"))] - MissingEtag, + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -170,11 +170,13 @@ impl std::fmt::Display for MicrosoftAzure { #[async_trait] impl ObjectStore for MicrosoftAzure { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let response = self + .client .put_request(location, Some(bytes), false, &()) .await?; - Ok(()) + let e_tag = Some(get_etag(response.headers()).context(MetadataSnafu)?); + Ok(PutResult { e_tag }) } async fn put_multipart( diff --git a/src/chunked.rs b/src/chunked.rs index d3e02b4..5694c55 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -30,6 +30,7 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, + PutResult, }; use crate::{MultipartId, Result}; @@ -62,7 +63,7 @@ impl Display for ChunkedStore { #[async_trait] impl ObjectStore for ChunkedStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { self.inner.put(location, bytes).await } diff --git a/src/client/header.rs b/src/client/header.rs index 6499eff..17f83a2 100644 --- a/src/client/header.rs +++ b/src/client/header.rs @@ -64,6 +64,12 @@ pub enum Error { }, } +/// Extracts an etag from the provided [`HeaderMap`] +pub fn get_etag(headers: &HeaderMap) -> Result { + let e_tag = headers.get(ETAG).ok_or(Error::MissingEtag)?; + Ok(e_tag.to_str().context(BadHeaderSnafu)?.to_string()) +} + /// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] pub fn header_meta( location: &Path, @@ -81,13 +87,10 @@ pub fn header_meta( None => Utc.timestamp_nanos(0), }; - let e_tag = match headers.get(ETAG) { - Some(e_tag) => { - let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; - Some(e_tag.to_string()) - } - None if cfg.etag_required => return Err(Error::MissingEtag), - None => None, + let e_tag = match get_etag(headers) { + Ok(e_tag) => Some(e_tag), + Err(Error::MissingEtag) if !cfg.etag_required => None, + Err(e) => return Err(e), }; let content_length = headers diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 513e396..97755c0 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -54,7 +54,7 @@ use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::{Path, DELIMITER}, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Result, RetryConfig, + ObjectStore, PutResult, Result, RetryConfig, }; use credential::{InstanceCredentialProvider, ServiceAccountCredentials}; @@ -65,6 +65,7 @@ const STORE: &str = "GCS"; /// [`CredentialProvider`] for [`GoogleCloudStorage`] pub type GcpCredentialProvider = Arc>; +use crate::client::header::get_etag; use crate::gcp::credential::{ApplicationDefaultCredentials, DEFAULT_GCS_BASE_URL}; pub use credential::GcpCredential; @@ -155,11 +156,10 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - #[snafu(display("ETag Header missing from response"))] - MissingEtag, - - #[snafu(display("Received header containing non-ASCII 
data"))] - BadHeader { source: header::ToStrError }, + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -247,7 +247,14 @@ impl GoogleCloudStorageClient { } /// Perform a put request - async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { + /// + /// Returns the new ETag + async fn put_request( + &self, + path: &Path, + payload: Bytes, + query: &T, + ) -> Result { let credential = self.get_credential().await?; let url = self.object_url(path); @@ -256,8 +263,10 @@ impl GoogleCloudStorageClient { .get_content_type(path) .unwrap_or("application/octet-stream"); - self.client + let response = self + .client .request(Method::PUT, url) + .query(query) .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) @@ -268,7 +277,7 @@ impl GoogleCloudStorageClient { path: path.as_ref(), })?; - Ok(()) + Ok(get_etag(response.headers()).context(MetadataSnafu)?) } /// Initiate a multi-part upload @@ -469,7 +478,7 @@ impl ListClient for GoogleCloudStorageClient { struct GCSMultipartUpload { client: Arc, - encoded_path: String, + path: Path, multipart_id: MultipartId, } @@ -478,38 +487,17 @@ impl PutPart for GCSMultipartUpload { /// Upload an object part async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { let upload_id = self.multipart_id.clone(); - let url = format!( - "{}/{}/{}", - self.client.base_url, self.client.bucket_name_encoded, self.encoded_path - ); - - let credential = self.client.get_credential().await?; - - let response = self + let content_id = self .client - .client - .request(Method::PUT, &url) - .bearer_auth(&credential.bearer) - .query(&[ - ("partNumber", format!("{}", part_idx + 1)), - ("uploadId", upload_id), - ]) - .header(header::CONTENT_TYPE, "application/octet-stream") - .header(header::CONTENT_LENGTH, format!("{}", buf.len())) - .body(buf) - .send_retry(&self.client.retry_config) - .await - .context(PutRequestSnafu { - path: &self.encoded_path, - })?; - - let content_id = response - .headers() - .get("ETag") - .context(MissingEtagSnafu)? - .to_str() - .context(BadHeaderSnafu)? 
- .to_string(); + .put_request( + &self.path, + buf.into(), + &[ + ("partNumber", format!("{}", part_idx + 1)), + ("uploadId", upload_id), + ], + ) + .await?; Ok(PartId { content_id }) } @@ -517,10 +505,7 @@ impl PutPart for GCSMultipartUpload { /// Complete a multipart upload async fn complete(&self, completed_parts: Vec) -> Result<()> { let upload_id = self.multipart_id.clone(); - let url = format!( - "{}/{}/{}", - self.client.base_url, self.client.bucket_name_encoded, self.encoded_path - ); + let url = self.client.object_url(&self.path); let parts = completed_parts .into_iter() @@ -550,7 +535,7 @@ impl PutPart for GCSMultipartUpload { .send_retry(&self.client.retry_config) .await .context(PostRequestSnafu { - path: &self.encoded_path, + path: self.path.as_ref(), })?; Ok(()) @@ -559,8 +544,9 @@ impl PutPart for GCSMultipartUpload { #[async_trait] impl ObjectStore for GoogleCloudStorage { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client.put_request(location, bytes).await + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let e_tag = self.client.put_request(location, bytes, &()).await?; + Ok(PutResult { e_tag: Some(e_tag) }) } async fn put_multipart( @@ -569,12 +555,9 @@ impl ObjectStore for GoogleCloudStorage { ) -> Result<(MultipartId, Box)> { let upload_id = self.client.multipart_initiate(location).await?; - let encoded_path = - percent_encode(location.to_string().as_bytes(), NON_ALPHANUMERIC).to_string(); - let inner = GCSMultipartUpload { client: Arc::clone(&self.client), - encoded_path, + path: location.clone(), multipart_id: upload_id.clone(), }; diff --git a/src/http/client.rs b/src/http/client.rs index b2a6ac0..4c2a7fc 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -160,7 +160,7 @@ impl Client { Ok(()) } - pub async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + pub async fn put(&self, location: &Path, bytes: Bytes) -> Result { let mut retry = false; loop { let url = self.path_url(location); @@ -170,7 +170,7 @@ impl Client { } match builder.send_retry(&self.retry_config).await { - Ok(_) => return Ok(()), + Ok(response) => return Ok(response), Err(source) => match source.status() { // Some implementations return 404 instead of 409 Some(StatusCode::CONFLICT | StatusCode::NOT_FOUND) if !retry => { diff --git a/src/http/mod.rs b/src/http/mod.rs index 2fd7850..e41e4f9 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -41,11 +41,12 @@ use tokio::io::AsyncWrite; use url::Url; use crate::client::get::GetClientExt; +use crate::client::header::get_etag; use crate::http::client::Client; use crate::path::Path; use crate::{ ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, - ObjectMeta, ObjectStore, Result, RetryConfig, + ObjectMeta, ObjectStore, PutResult, Result, RetryConfig, }; mod client; @@ -95,8 +96,14 @@ impl std::fmt::Display for HttpStore { #[async_trait] impl ObjectStore for HttpStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client.put(location, bytes).await + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let response = self.client.put(location, bytes).await?; + let e_tag = match get_etag(response.headers()) { + Ok(e_tag) => Some(e_tag), + Err(crate::client::header::Error::MissingEtag) => None, + Err(source) => return Err(Error::Metadata { source }.into()), + }; + Ok(PutResult { e_tag }) } async fn put_multipart( diff --git a/src/lib.rs b/src/lib.rs index 9b39644..018f0f5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ 
-300,7 +300,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// The operation is guaranteed to be atomic, it will either successfully /// write the entirety of `bytes` to `location`, or fail. No clients /// should be able to observe a partially written object - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()>; + async fn put(&self, location: &Path, bytes: Bytes) -> Result; /// Get a multi-part upload that allows writing data in chunks /// @@ -528,7 +528,7 @@ macro_rules! as_ref_impl { ($type:ty) => { #[async_trait] impl ObjectStore for $type { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { self.as_ref().put(location, bytes).await } @@ -659,6 +659,8 @@ pub struct ObjectMeta { /// The size in bytes of the object pub size: usize, /// The unique identifier for the object + /// + /// pub e_tag: Option, } @@ -850,6 +852,15 @@ impl GetResult { } } +/// Result for a put request +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PutResult { + /// The unique identifier for the object + /// + /// + pub e_tag: Option, +} + /// A specialized `Result` for object store-related errors pub type Result = std::result::Result; @@ -1383,6 +1394,26 @@ mod tests { ..GetOptions::default() }; storage.get_opts(&path, options).await.unwrap(); + + let result = storage.put(&path, "test".into()).await.unwrap(); + let new_tag = result.e_tag.unwrap(); + assert_ne!(tag, new_tag); + + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.e_tag.unwrap(), new_tag); + + let options = GetOptions { + if_match: Some(new_tag), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let options = GetOptions { + if_match: Some(tag), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::Precondition { .. 
}), "{err}"); } /// Returns a chunk of length `chunk_length` diff --git a/src/limit.rs b/src/limit.rs index 00cbce0..8a45381 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -19,7 +19,7 @@ use crate::{ BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, - ObjectMeta, ObjectStore, Path, Result, StreamExt, + ObjectMeta, ObjectStore, Path, PutResult, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -72,7 +72,7 @@ impl std::fmt::Display for LimitStore { #[async_trait] impl ObjectStore for LimitStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.put(location, bytes).await } diff --git a/src/local.rs b/src/local.rs index 38467c3..4b7c963 100644 --- a/src/local.rs +++ b/src/local.rs @@ -20,7 +20,7 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, - ObjectStore, Result, + ObjectStore, PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -36,6 +36,7 @@ use std::ops::Range; use std::pin::Pin; use std::sync::Arc; use std::task::Poll; +use std::time::SystemTime; use std::{collections::BTreeSet, convert::TryFrom, io}; use std::{collections::VecDeque, path::PathBuf}; use tokio::io::AsyncWrite; @@ -270,7 +271,7 @@ impl Config { #[async_trait] impl ObjectStore for LocalFileSystem { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; @@ -282,8 +283,17 @@ impl ObjectStore for LocalFileSystem { }) .map_err(|e| { let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup - e.into() - }) + e + })?; + + let metadata = file.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: path.to_string_lossy().to_string(), + })?; + + Ok(PutResult { + e_tag: Some(get_etag(&metadata)), + }) }) .await } @@ -959,24 +969,33 @@ fn last_modified(metadata: &Metadata) -> DateTime { .into() } +fn get_etag(metadata: &Metadata) -> String { + let inode = get_inode(metadata); + let size = metadata.len(); + let mtime = metadata + .modified() + .ok() + .and_then(|mtime| mtime.duration_since(SystemTime::UNIX_EPOCH).ok()) + .unwrap_or_default() + .as_micros(); + + // Use an ETag scheme based on that used by many popular HTTP servers + // + // + format!("{inode:x}-{mtime:x}-{size:x}") +} + fn convert_metadata(metadata: Metadata, location: Path) -> Result { let last_modified = last_modified(&metadata); let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { path: location.as_ref(), })?; - let inode = get_inode(&metadata); - let mtime = last_modified.timestamp_micros(); - - // Use an ETag scheme based on that used by many popular HTTP servers - // - // - let etag = format!("{inode:x}-{mtime:x}-{size:x}"); Ok(ObjectMeta { location, last_modified, size, - e_tag: Some(etag), + e_tag: Some(get_etag(&metadata)), }) } diff --git a/src/memory.rs b/src/memory.rs index 00b330b..952b457 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -17,7 +17,8 @@ //! 
An in-memory object store implementation use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, + PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -106,11 +107,12 @@ struct Storage { type SharedStorage = Arc>; impl Storage { - fn insert(&mut self, location: &Path, bytes: Bytes) { + fn insert(&mut self, location: &Path, bytes: Bytes) -> usize { let etag = self.next_etag; self.next_etag += 1; let entry = Entry::new(bytes, Utc::now(), etag); self.map.insert(location.clone(), entry); + etag } } @@ -122,9 +124,11 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.storage.write().insert(location, bytes); - Ok(()) + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let etag = self.storage.write().insert(location, bytes); + Ok(PutResult { + e_tag: Some(etag.to_string()), + }) } async fn put_multipart( diff --git a/src/prefix.rs b/src/prefix.rs index 3776dec..21f6c1d 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -23,7 +23,8 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, + Result, }; #[doc(hidden)] @@ -79,7 +80,7 @@ impl PrefixStore { #[async_trait::async_trait] impl ObjectStore for PrefixStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { let full_path = self.full_path(location); self.inner.put(&full_path, bytes).await } diff --git a/src/throttle.rs b/src/throttle.rs index f716a11..d6f191b 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -21,7 +21,8 @@ use std::ops::Range; use std::{convert::TryInto, sync::Arc}; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, + PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -147,7 +148,7 @@ impl std::fmt::Display for ThrottledStore { #[async_trait] impl ObjectStore for ThrottledStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { sleep(self.config().wait_put_per_call).await; self.inner.put(location, bytes).await diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index 25c4692..5703d7f 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -23,7 +23,7 @@ use futures::stream::BoxStream; use object_store::local::LocalFileSystem; use object_store::path::Path; use object_store::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, }; use std::fmt::Formatter; use tempfile::tempdir; @@ -40,7 +40,7 @@ impl std::fmt::Display for MyStore { #[async_trait] impl ObjectStore for MyStore { - async fn put(&self, path: &Path, data: Bytes) -> object_store::Result<()> { + async fn put(&self, path: &Path, data: Bytes) -> object_store::Result { self.0.put(path, data).await } From 82f893d06fe1393303e49d40b630f5dfddd63e19 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> 
Date: Thu, 19 Oct 2023 13:40:49 +0100 Subject: [PATCH 209/397] Split aws Module (#4953) * Split aws module * Clippy * Fix doc --- src/aws/builder.rs | 1098 +++++++++++++++++++++++++++++++++++++++++ src/aws/mod.rs | 1169 +------------------------------------------- src/aws/resolve.rs | 106 ++++ 3 files changed, 1225 insertions(+), 1148 deletions(-) create mode 100644 src/aws/builder.rs create mode 100644 src/aws/resolve.rs diff --git a/src/aws/builder.rs b/src/aws/builder.rs new file mode 100644 index 0000000..422ba15 --- /dev/null +++ b/src/aws/builder.rs @@ -0,0 +1,1098 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::aws::client::{S3Client, S3Config}; +use crate::aws::credential::{ + InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, +}; +use crate::aws::{ + AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3CopyIfNotExists, STORE, +}; +use crate::client::TokenCredentialProvider; +use crate::config::ConfigValue; +use crate::{ + ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, +}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::str::FromStr; +use std::sync::Arc; +use tracing::info; +use url::Url; + +/// Default metadata endpoint +static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Missing region"))] + MissingRegion, + + #[snafu(display("Missing bucket name"))] + MissingBucketName, + + #[snafu(display("Missing AccessKeyId"))] + MissingAccessKeyId, + + #[snafu(display("Missing SecretAccessKey"))] + MissingSecretAccessKey, + + #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, + + #[snafu(display("Bucket '{}' not found", bucket))] + BucketNotFound { bucket: String }, + + #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + ResolveRegion { + bucket: String, + source: reqwest::Error, + }, + + #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + RegionParse { bucket: String }, +} + +impl From for crate::Error { + fn from(source: Error) -> Self { + match source { + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: STORE, key } + } + _ => Self::Generic { + store: STORE, + source: Box::new(source), + }, + } + } +} + +/// Configure a connection to Amazon S3 using the specified credentials in +/// the specified Amazon region and bucket. +/// +/// # Example +/// ``` +/// # let REGION = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY_ID = "foo"; +/// # let SECRET_KEY = "foo"; +/// # use object_store::aws::AmazonS3Builder; +/// let s3 = AmazonS3Builder::new() +/// .with_region(REGION) +/// .with_bucket_name(BUCKET_NAME) +/// .with_access_key_id(ACCESS_KEY_ID) +/// .with_secret_access_key(SECRET_KEY) +/// .build(); +/// ``` +#[derive(Debug, Default, Clone)] +pub struct AmazonS3Builder { + /// Access key id + access_key_id: Option, + /// Secret access_key + secret_access_key: Option, + /// Region + region: Option, + /// Bucket name + bucket_name: Option, + /// Endpoint for communicating with AWS S3 + endpoint: Option, + /// Token to use for requests + token: Option, + /// Url + url: Option, + /// Retry config + retry_config: RetryConfig, + /// When set to true, fallback to IMDSv1 + imdsv1_fallback: ConfigValue, + /// When set to true, virtual hosted style request has to be used + virtual_hosted_style_request: ConfigValue, + /// When set to true, unsigned payload option has to be used + unsigned_payload: ConfigValue, + /// Checksum algorithm which has to be used for object integrity check during upload + checksum_algorithm: Option>, + /// Metadata endpoint, see + metadata_endpoint: Option, + /// Container credentials URL, see + container_credentials_relative_uri: Option, + /// Client options + client_options: ClientOptions, + /// Credentials + credentials: Option, + /// Skip signing requests + skip_signature: ConfigValue, + /// Copy if not exists + copy_if_not_exists: Option>, +} + +/// Configuration keys for [`AmazonS3Builder`] +/// +/// Configuration via keys can be done via [`AmazonS3Builder::with_config`] +/// +/// # Example +/// ``` +/// # use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; +/// let builder = AmazonS3Builder::new() +/// .with_config("aws_access_key_id".parse().unwrap(), "my-access-key-id") +/// .with_config(AmazonS3ConfigKey::DefaultRegion, "my-default-region"); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] +pub enum AmazonS3ConfigKey { + /// AWS Access Key + /// + /// See [`AmazonS3Builder::with_access_key_id`] for details. 
+ /// + /// Supported keys: + /// - `aws_access_key_id` + /// - `access_key_id` + AccessKeyId, + + /// Secret Access Key + /// + /// See [`AmazonS3Builder::with_secret_access_key`] for details. + /// + /// Supported keys: + /// - `aws_secret_access_key` + /// - `secret_access_key` + SecretAccessKey, + + /// Region + /// + /// See [`AmazonS3Builder::with_region`] for details. + /// + /// Supported keys: + /// - `aws_region` + /// - `region` + Region, + + /// Default region + /// + /// See [`AmazonS3Builder::with_region`] for details. + /// + /// Supported keys: + /// - `aws_default_region` + /// - `default_region` + DefaultRegion, + + /// Bucket name + /// + /// See [`AmazonS3Builder::with_bucket_name`] for details. + /// + /// Supported keys: + /// - `aws_bucket` + /// - `aws_bucket_name` + /// - `bucket` + /// - `bucket_name` + Bucket, + + /// Sets custom endpoint for communicating with AWS S3. + /// + /// See [`AmazonS3Builder::with_endpoint`] for details. + /// + /// Supported keys: + /// - `aws_endpoint` + /// - `aws_endpoint_url` + /// - `endpoint` + /// - `endpoint_url` + Endpoint, + + /// Token to use for requests (passed to underlying provider) + /// + /// See [`AmazonS3Builder::with_token`] for details. + /// + /// Supported keys: + /// - `aws_session_token` + /// - `aws_token` + /// - `session_token` + /// - `token` + Token, + + /// Fall back to ImdsV1 + /// + /// See [`AmazonS3Builder::with_imdsv1_fallback`] for details. + /// + /// Supported keys: + /// - `aws_imdsv1_fallback` + /// - `imdsv1_fallback` + ImdsV1Fallback, + + /// If virtual hosted style request has to be used + /// + /// See [`AmazonS3Builder::with_virtual_hosted_style_request`] for details. + /// + /// Supported keys: + /// - `aws_virtual_hosted_style_request` + /// - `virtual_hosted_style_request` + VirtualHostedStyleRequest, + + /// Avoid computing payload checksum when calculating signature. + /// + /// See [`AmazonS3Builder::with_unsigned_payload`] for details. + /// + /// Supported keys: + /// - `aws_unsigned_payload` + /// - `unsigned_payload` + UnsignedPayload, + + /// Set the checksum algorithm for this client + /// + /// See [`AmazonS3Builder::with_checksum_algorithm`] + Checksum, + + /// Set the instance metadata endpoint + /// + /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. 
+ /// + /// Supported keys: + /// - `aws_metadata_endpoint` + /// - `metadata_endpoint` + MetadataEndpoint, + + /// Set the container credentials relative URI + /// + /// + ContainerCredentialsRelativeUri, + + /// Configure how to provide `copy_if_not_exists` + /// + /// See [`S3CopyIfNotExists`] + CopyIfNotExists, + + /// Skip signing request + SkipSignature, + + /// Client options + Client(ClientConfigKey), +} + +impl AsRef for AmazonS3ConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AccessKeyId => "aws_access_key_id", + Self::SecretAccessKey => "aws_secret_access_key", + Self::Region => "aws_region", + Self::Bucket => "aws_bucket", + Self::Endpoint => "aws_endpoint", + Self::Token => "aws_session_token", + Self::ImdsV1Fallback => "aws_imdsv1_fallback", + Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", + Self::DefaultRegion => "aws_default_region", + Self::MetadataEndpoint => "aws_metadata_endpoint", + Self::UnsignedPayload => "aws_unsigned_payload", + Self::Checksum => "aws_checksum_algorithm", + Self::ContainerCredentialsRelativeUri => { + "aws_container_credentials_relative_uri" + } + Self::SkipSignature => "aws_skip_signature", + Self::CopyIfNotExists => "copy_if_not_exists", + Self::Client(opt) => opt.as_ref(), + } + } +} + +impl FromStr for AmazonS3ConfigKey { + type Err = crate::Error; + + fn from_str(s: &str) -> Result { + match s { + "aws_access_key_id" | "access_key_id" => Ok(Self::AccessKeyId), + "aws_secret_access_key" | "secret_access_key" => Ok(Self::SecretAccessKey), + "aws_default_region" | "default_region" => Ok(Self::DefaultRegion), + "aws_region" | "region" => Ok(Self::Region), + "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => { + Ok(Self::Bucket) + } + "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => { + Ok(Self::Endpoint) + } + "aws_session_token" | "aws_token" | "session_token" | "token" => { + Ok(Self::Token) + } + "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { + Ok(Self::VirtualHostedStyleRequest) + } + "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), + "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), + "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), + "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), + "aws_container_credentials_relative_uri" => { + Ok(Self::ContainerCredentialsRelativeUri) + } + "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), + "copy_if_not_exists" => Ok(Self::CopyIfNotExists), + // Backwards compatibility + "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, + } + } +} + +impl AmazonS3Builder { + /// Create a new [`AmazonS3Builder`] with default values. 
+ pub fn new() -> Self { + Default::default() + } + + /// Fill the [`AmazonS3Builder`] with regular AWS environment variables + /// + /// Variables extracted from environment: + /// * `AWS_ACCESS_KEY_ID` -> access_key_id + /// * `AWS_SECRET_ACCESS_KEY` -> secret_access_key + /// * `AWS_DEFAULT_REGION` -> region + /// * `AWS_ENDPOINT` -> endpoint + /// * `AWS_SESSION_TOKEN` -> token + /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> + /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS + /// # Example + /// ``` + /// use object_store::aws::AmazonS3Builder; + /// + /// let s3 = AmazonS3Builder::from_env() + /// .with_bucket_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder: Self = Default::default(); + + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("AWS_") { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + builder = builder.with_config(config_key, value); + } + } + } + } + + builder + } + + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `s3:///` + /// - `s3a:///` + /// - `https://s3..amazonaws.com/` + /// - `https://.s3..amazonaws.com` + /// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket` + /// + /// Note: Settings derived from the URL will override any others set on this builder + /// + /// # Example + /// ``` + /// use object_store::aws::AmazonS3Builder; + /// + /// let s3 = AmazonS3Builder::from_env() + /// .with_url("s3://bucket/path") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Set an option on the builder via a key - value pair. + pub fn with_config( + mut self, + key: AmazonS3ConfigKey, + value: impl Into, + ) -> Self { + match key { + AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), + AmazonS3ConfigKey::SecretAccessKey => { + self.secret_access_key = Some(value.into()) + } + AmazonS3ConfigKey::Region => self.region = Some(value.into()), + AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), + AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), + AmazonS3ConfigKey::Token => self.token = Some(value.into()), + AmazonS3ConfigKey::ImdsV1Fallback => self.imdsv1_fallback.parse(value), + AmazonS3ConfigKey::VirtualHostedStyleRequest => { + self.virtual_hosted_style_request.parse(value) + } + AmazonS3ConfigKey::DefaultRegion => { + self.region = self.region.or_else(|| Some(value.into())) + } + AmazonS3ConfigKey::MetadataEndpoint => { + self.metadata_endpoint = Some(value.into()) + } + AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), + AmazonS3ConfigKey::Checksum => { + self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) + } + AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { + self.container_credentials_relative_uri = Some(value.into()) + } + AmazonS3ConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } + AmazonS3ConfigKey::SkipSignature => self.skip_signature.parse(value), + AmazonS3ConfigKey::CopyIfNotExists => { + self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) + } + }; + self + } + + /// Set an option on the builder via a key - value pair. + /// + /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. 
+ #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) + } + + /// Hydrate builder from key value pairs + /// + /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + + /// Get config value via a [`AmazonS3ConfigKey`]. + /// + /// # Example + /// ``` + /// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; + /// + /// let builder = AmazonS3Builder::from_env() + /// .with_bucket_name("foo"); + /// let bucket_name = builder.get_config_value(&AmazonS3ConfigKey::Bucket).unwrap_or_default(); + /// assert_eq!("foo", &bucket_name); + /// ``` + pub fn get_config_value(&self, key: &AmazonS3ConfigKey) -> Option { + match key { + AmazonS3ConfigKey::AccessKeyId => self.access_key_id.clone(), + AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key.clone(), + AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => { + self.region.clone() + } + AmazonS3ConfigKey::Bucket => self.bucket_name.clone(), + AmazonS3ConfigKey::Endpoint => self.endpoint.clone(), + AmazonS3ConfigKey::Token => self.token.clone(), + AmazonS3ConfigKey::ImdsV1Fallback => Some(self.imdsv1_fallback.to_string()), + AmazonS3ConfigKey::VirtualHostedStyleRequest => { + Some(self.virtual_hosted_style_request.to_string()) + } + AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), + AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), + AmazonS3ConfigKey::Checksum => { + self.checksum_algorithm.as_ref().map(ToString::to_string) + } + AmazonS3ConfigKey::Client(key) => self.client_options.get_config_value(key), + AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { + self.container_credentials_relative_uri.clone() + } + AmazonS3ConfigKey::SkipSignature => Some(self.skip_signature.to_string()), + AmazonS3ConfigKey::CopyIfNotExists => { + self.copy_if_not_exists.as_ref().map(ToString::to_string) + } + } + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + match parsed.scheme() { + "s3" | "s3a" => self.bucket_name = Some(host.to_string()), + "https" => match host.splitn(4, '.').collect_tuple() { + Some(("s3", region, "amazonaws", "com")) => { + self.region = Some(region.to_string()); + let bucket = parsed.path_segments().into_iter().flatten().next(); + if let Some(bucket) = bucket { + self.bucket_name = Some(bucket.into()); + } + } + Some((bucket, "s3", region, "amazonaws.com")) => { + self.bucket_name = Some(bucket.to_string()); + self.region = Some(region.to_string()); + self.virtual_hosted_style_request = true.into(); + } + Some((account, "r2", "cloudflarestorage", "com")) => { + self.region = Some("auto".to_string()); + let endpoint = format!("https://{account}.r2.cloudflarestorage.com"); + self.endpoint = Some(endpoint); + + let bucket = 
parsed.path_segments().into_iter().flatten().next(); + if let Some(bucket) = bucket { + self.bucket_name = Some(bucket.into()); + } + } + _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + }, + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + }; + Ok(()) + } + + /// Set the AWS Access Key (required) + pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { + self.access_key_id = Some(access_key_id.into()); + self + } + + /// Set the AWS Secret Access Key (required) + pub fn with_secret_access_key( + mut self, + secret_access_key: impl Into, + ) -> Self { + self.secret_access_key = Some(secret_access_key.into()); + self + } + + /// Set the region (e.g. `us-east-1`) (required) + pub fn with_region(mut self, region: impl Into) -> Self { + self.region = Some(region.into()); + self + } + + /// Set the bucket_name (required) + pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { + self.bucket_name = Some(bucket_name.into()); + self + } + + /// Sets the endpoint for communicating with AWS S3. Default value + /// is based on region. The `endpoint` field should be consistent with + /// the field `virtual_hosted_style_request'. + /// + /// For example, this might be set to `"http://localhost:4566:` + /// for testing against a localstack instance. + /// If `virtual_hosted_style_request` is set to true then `endpoint` + /// should have bucket name included. + pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { + self.endpoint = Some(endpoint.into()); + self + } + + /// Set the token to use for requests (passed to underlying provider) + pub fn with_token(mut self, token: impl Into) -> Self { + self.token = Some(token.into()); + self + } + + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: AwsCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + + /// Sets what protocol is allowed. If `allow_http` is : + /// * false (default): Only HTTPS are allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.client_options = self.client_options.with_allow_http(allow_http); + self + } + + /// Sets if virtual hosted style request has to be used. + /// If `virtual_hosted_style_request` is : + /// * false (default): Path style request is used + /// * true: Virtual hosted style request is used + /// + /// If the `endpoint` is provided then it should be + /// consistent with `virtual_hosted_style_request`. + /// i.e. if `virtual_hosted_style_request` is set to true + /// then `endpoint` should have bucket name included. + pub fn with_virtual_hosted_style_request( + mut self, + virtual_hosted_style_request: bool, + ) -> Self { + self.virtual_hosted_style_request = virtual_hosted_style_request.into(); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// By default instance credentials will only be fetched over [IMDSv2], as AWS recommends + /// against having IMDSv1 enabled on EC2 instances as it is vulnerable to [SSRF attack] + /// + /// However, certain deployment environments, such as those running old versions of kube2iam, + /// may not support IMDSv2. This option will enable automatic fallback to using IMDSv1 + /// if the token endpoint returns a 403 error indicating that IMDSv2 is not supported. 
+ /// + /// This option has no effect if not using instance credentials + /// + /// [IMDSv2]: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html + /// [SSRF attack]: https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/ + /// + pub fn with_imdsv1_fallback(mut self) -> Self { + self.imdsv1_fallback = true.into(); + self + } + + /// Sets if unsigned payload option has to be used. + /// See [unsigned payload option](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-header-based-auth.html) + /// * false (default): Signed payload option is used, where the checksum for the request body is computed and included when constructing a canonical request. + /// * true: Unsigned payload option is used. `UNSIGNED-PAYLOAD` literal is included when constructing a canonical request, + pub fn with_unsigned_payload(mut self, unsigned_payload: bool) -> Self { + self.unsigned_payload = unsigned_payload.into(); + self + } + + /// If enabled, [`AmazonS3`] will not fetch credentials and will not sign requests + /// + /// This can be useful when interacting with public S3 buckets that deny authorized requests + pub fn with_skip_signature(mut self, skip_signature: bool) -> Self { + self.skip_signature = skip_signature.into(); + self + } + + /// Sets the [checksum algorithm] which has to be used for object integrity check during upload. + /// + /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { + // Convert to String to enable deferred parsing of config + self.checksum_algorithm = Some(checksum_algorithm.into()); + self + } + + /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), + /// used primarily within AWS EC2. + /// + /// This defaults to the IPv4 endpoint: http://169.254.169.254. One can alternatively use the IPv6 + /// endpoint http://fd00:ec2::254. + pub fn with_metadata_endpoint(mut self, endpoint: impl Into) -> Self { + self.metadata_endpoint = Some(endpoint.into()); + self + } + + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; + self + } + + /// Configure how to provide `copy_if_not_exists` + pub fn with_copy_if_not_exists(mut self, config: S3CopyIfNotExists) -> Self { + self.copy_if_not_exists = Some(config.into()); + self + } + + /// Create a [`AmazonS3`] instance from the provided values, + /// consuming `self`. 
+ pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; + } + + let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; + let region = self.region.context(MissingRegionSnafu)?; + let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; + let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; + + let credentials = if let Some(credentials) = self.credentials { + credentials + } else if self.access_key_id.is_some() || self.secret_access_key.is_some() { + match (self.access_key_id, self.secret_access_key, self.token) { + (Some(key_id), Some(secret_key), token) => { + info!("Using Static credential provider"); + let credential = AwsCredential { + key_id, + secret_key, + token, + }; + Arc::new(StaticCredentialProvider::new(credential)) as _ + } + (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), + (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), + (None, None, _) => unreachable!(), + } + } else if let (Ok(token_path), Ok(role_arn)) = ( + std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), + std::env::var("AWS_ROLE_ARN"), + ) { + // TODO: Replace with `AmazonS3Builder::credentials_from_env` + info!("Using WebIdentity credential provider"); + + let session_name = std::env::var("AWS_ROLE_SESSION_NAME") + .unwrap_or_else(|_| "WebIdentitySession".to_string()); + + let endpoint = format!("https://sts.{region}.amazonaws.com"); + + // Disallow non-HTTPs requests + let client = self + .client_options + .clone() + .with_allow_http(false) + .client()?; + + let token = WebIdentityProvider { + token_path, + session_name, + role_arn, + endpoint, + }; + + Arc::new(TokenCredentialProvider::new( + token, + client, + self.retry_config.clone(), + )) as _ + } else if let Some(uri) = self.container_credentials_relative_uri { + info!("Using Task credential provider"); + Arc::new(TaskCredentialProvider { + url: format!("http://169.254.170.2{uri}"), + retry: self.retry_config.clone(), + // The instance metadata endpoint is access over HTTP + client: self.client_options.clone().with_allow_http(true).client()?, + cache: Default::default(), + }) as _ + } else { + info!("Using Instance credential provider"); + + let token = InstanceCredentialProvider { + cache: Default::default(), + imdsv1_fallback: self.imdsv1_fallback.get()?, + metadata_endpoint: self + .metadata_endpoint + .unwrap_or_else(|| DEFAULT_METADATA_ENDPOINT.into()), + }; + + Arc::new(TokenCredentialProvider::new( + token, + self.client_options.metadata_client()?, + self.retry_config.clone(), + )) as _ + }; + + let endpoint: String; + let bucket_endpoint: String; + + // If `endpoint` is provided then its assumed to be consistent with + // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then + // `endpoint` should have bucket name included. + if self.virtual_hosted_style_request.get()? 
{ + endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); + bucket_endpoint = endpoint.clone(); + } else { + endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://s3.{region}.amazonaws.com")); + bucket_endpoint = format!("{endpoint}/{bucket}"); + } + + let config = S3Config { + region, + endpoint, + bucket, + bucket_endpoint, + credentials, + retry_config: self.retry_config, + client_options: self.client_options, + sign_payload: !self.unsigned_payload.get()?, + skip_signature: self.skip_signature.get()?, + checksum, + copy_if_not_exists, + }; + + let client = Arc::new(S3Client::new(config)?); + + Ok(AmazonS3 { client }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + #[test] + fn s3_test_config_from_map() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + let options = HashMap::from([ + ("aws_access_key_id", aws_access_key_id.clone()), + ("aws_secret_access_key", aws_secret_access_key), + ("aws_default_region", aws_default_region.clone()), + ("aws_endpoint", aws_endpoint.clone()), + ("aws_session_token", aws_session_token.clone()), + ("aws_unsigned_payload", "true".to_string()), + ("aws_checksum_algorithm", "sha256".to_string()), + ]); + + let builder = options + .into_iter() + .fold(AmazonS3Builder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }) + .with_config(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key"); + + assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); + assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); + assert_eq!(builder.region.unwrap(), aws_default_region); + assert_eq!(builder.endpoint.unwrap(), aws_endpoint); + assert_eq!(builder.token.unwrap(), aws_session_token); + assert_eq!( + builder.checksum_algorithm.unwrap().get().unwrap(), + Checksum::SHA256 + ); + assert!(builder.unsigned_payload.get().unwrap()); + } + + #[test] + fn s3_test_config_get_value() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + + let builder = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::AccessKeyId, &aws_access_key_id) + .with_config(AmazonS3ConfigKey::SecretAccessKey, &aws_secret_access_key) + .with_config(AmazonS3ConfigKey::DefaultRegion, &aws_default_region) + .with_config(AmazonS3ConfigKey::Endpoint, &aws_endpoint) + .with_config(AmazonS3ConfigKey::Token, &aws_session_token) + .with_config(AmazonS3ConfigKey::UnsignedPayload, "true"); + + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::AccessKeyId) + .unwrap(), + aws_access_key_id + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::SecretAccessKey) + .unwrap(), + aws_secret_access_key + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::DefaultRegion) + .unwrap(), + aws_default_region + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::Endpoint) + .unwrap(), + aws_endpoint 
+ ); + assert_eq!( + builder.get_config_value(&AmazonS3ConfigKey::Token).unwrap(), + aws_session_token + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::UnsignedPayload) + .unwrap(), + "true" + ); + } + + #[test] + fn s3_test_urls() { + let mut builder = AmazonS3Builder::new(); + builder.parse_url("s3://bucket/path").unwrap(); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("s3://buckets.can.have.dots/path") + .unwrap(); + assert_eq!( + builder.bucket_name, + Some("buckets.can.have.dots".to_string()) + ); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com/bucket") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com/bucket.with.dot/path") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + assert_eq!(builder.bucket_name, Some("bucket.with.dot".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://bucket.s3.region.amazonaws.com") + .unwrap(); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + assert_eq!(builder.region, Some("region".to_string())); + assert!(builder.virtual_hosted_style_request.get().unwrap()); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://account123.r2.cloudflarestorage.com/bucket-123") + .unwrap(); + + assert_eq!(builder.bucket_name, Some("bucket-123".to_string())); + assert_eq!(builder.region, Some("auto".to_string())); + assert_eq!( + builder.endpoint, + Some("https://account123.r2.cloudflarestorage.com".to_string()) + ); + + let err_cases = [ + "mailto://bucket/path", + "https://s3.bucket.mydomain.com", + "https://s3.bucket.foo.amazonaws.com", + "https://bucket.mydomain.region.amazonaws.com", + "https://bucket.s3.region.bar.amazonaws.com", + "https://bucket.foo.s3.amazonaws.com", + ]; + let mut builder = AmazonS3Builder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } + } + + #[tokio::test] + async fn s3_test_proxy_url() { + let s3 = AmazonS3Builder::new() + .with_access_key_id("access_key_id") + .with_secret_access_key("secret_access_key") + .with_region("region") + .with_bucket_name("bucket_name") + .with_allow_http(true) + .with_proxy_url("https://example.com") + .build(); + + assert!(s3.is_ok()); + + let err = AmazonS3Builder::new() + .with_access_key_id("access_key_id") + .with_secret_access_key("secret_access_key") + .with_region("region") + .with_bucket_name("bucket_name") + .with_allow_http(true) + .with_proxy_url("asdf://example.com") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + "Generic HTTP client error: builder error: unknown proxy scheme", + err + ); + } + + #[test] + fn test_invalid_config() { + let err = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::ImdsV1Fallback, "enabled") + .with_bucket_name("bucket") + .with_region("region") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Generic Config error: failed to parse \"enabled\" as boolean" + ); + + let err = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::Checksum, "md5") + .with_bucket_name("bucket") + 
.with_region("region") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Generic Config error: \"md5\" is not a valid checksum algorithm" + ); + } +} diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 6d5aece..a4e39c3 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -35,40 +35,33 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; -use itertools::Itertools; use reqwest::Method; -use serde::{Deserialize, Serialize}; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::{str::FromStr, sync::Arc, time::Duration}; +use std::{sync::Arc, time::Duration}; use tokio::io::AsyncWrite; -use tracing::info; use url::Url; -use crate::aws::client::{S3Client, S3Config}; -use crate::aws::credential::{ - InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, -}; +use crate::aws::client::S3Client; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; -use crate::client::{ - ClientConfigKey, CredentialProvider, StaticCredentialProvider, - TokenCredentialProvider, -}; -use crate::config::ConfigValue; +use crate::client::CredentialProvider; use crate::multipart::{PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ - ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, PutResult, Result, RetryConfig, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, + PutResult, Result, }; +mod builder; mod checksum; mod client; mod copy; mod credential; +mod resolve; +pub use builder::{AmazonS3Builder, AmazonS3ConfigKey}; pub use checksum::Checksum; pub use copy::S3CopyIfNotExists; +pub use resolve::resolve_bucket_region; // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html // @@ -90,103 +83,6 @@ const STORE: &str = "S3"; pub type AwsCredentialProvider = Arc>; pub use credential::{AwsAuthorizer, AwsCredential}; -/// Default metadata endpoint -static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; - -/// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] -#[allow(missing_docs)] -enum Error { - #[snafu(display("Missing region"))] - MissingRegion, - - #[snafu(display("Missing bucket name"))] - MissingBucketName, - - #[snafu(display("Missing AccessKeyId"))] - MissingAccessKeyId, - - #[snafu(display("Missing SecretAccessKey"))] - MissingSecretAccessKey, - - #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] - UnableToParseUrl { - source: url::ParseError, - url: String, - }, - - #[snafu(display( - "Unknown url scheme cannot be parsed into storage location: {}", - scheme - ))] - UnknownUrlScheme { scheme: String }, - - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] - UrlNotRecognised { url: String }, - - #[snafu(display("Configuration key: '{}' is not known.", key))] - UnknownConfigurationKey { key: String }, - - #[snafu(display("Bucket '{}' not found", bucket))] - BucketNotFound { bucket: String }, - - #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] - ResolveRegion { - bucket: String, - source: reqwest::Error, - }, - - #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] - RegionParse { bucket: String }, -} - -impl From for super::Error { - fn from(source: Error) -> Self { - match source { - Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: STORE, key } - } - _ => Self::Generic { - store: STORE, - source: Box::new(source), - }, - } - } -} - -/// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. -/// -/// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html -pub async fn resolve_bucket_region( - bucket: &str, - client_options: &ClientOptions, -) -> Result { - use reqwest::StatusCode; - - let endpoint = format!("https://{}.s3.amazonaws.com", bucket); - - let client = client_options.client()?; - - let response = client - .head(&endpoint) - .send() - .await - .context(ResolveRegionSnafu { bucket })?; - - ensure!( - response.status() != StatusCode::NOT_FOUND, - BucketNotFoundSnafu { bucket } - ); - - let region = response - .headers() - .get("x-amz-bucket-region") - .and_then(|x| x.to_str().ok()) - .context(RegionParseSnafu { bucket })?; - - Ok(region.to_string()) -} - /// Interface for [Amazon S3](https://aws.amazon.com/s3/). #[derive(Debug)] pub struct AmazonS3 { @@ -256,8 +152,10 @@ impl Signer for AmazonS3 { AwsAuthorizer::new(&credential, "s3", &self.client.config().region); let path_url = self.path_url(path); - let mut url = - Url::parse(&path_url).context(UnableToParseUrlSnafu { url: path_url })?; + let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { + store: STORE, + source: format!("Unable to parse url {path_url}: {e}").into(), + })?; authorizer.sign(method, &mut url, expires_in); @@ -381,891 +279,23 @@ impl PutPart for S3MultiPartUpload { } } -/// Configure a connection to Amazon S3 using the specified credentials in -/// the specified Amazon region and bucket. 
-/// -/// # Example -/// ``` -/// # let REGION = "foo"; -/// # let BUCKET_NAME = "foo"; -/// # let ACCESS_KEY_ID = "foo"; -/// # let SECRET_KEY = "foo"; -/// # use object_store::aws::AmazonS3Builder; -/// let s3 = AmazonS3Builder::new() -/// .with_region(REGION) -/// .with_bucket_name(BUCKET_NAME) -/// .with_access_key_id(ACCESS_KEY_ID) -/// .with_secret_access_key(SECRET_KEY) -/// .build(); -/// ``` -#[derive(Debug, Default, Clone)] -pub struct AmazonS3Builder { - /// Access key id - access_key_id: Option, - /// Secret access_key - secret_access_key: Option, - /// Region - region: Option, - /// Bucket name - bucket_name: Option, - /// Endpoint for communicating with AWS S3 - endpoint: Option, - /// Token to use for requests - token: Option, - /// Url - url: Option, - /// Retry config - retry_config: RetryConfig, - /// When set to true, fallback to IMDSv1 - imdsv1_fallback: ConfigValue, - /// When set to true, virtual hosted style request has to be used - virtual_hosted_style_request: ConfigValue, - /// When set to true, unsigned payload option has to be used - unsigned_payload: ConfigValue, - /// Checksum algorithm which has to be used for object integrity check during upload - checksum_algorithm: Option>, - /// Metadata endpoint, see - metadata_endpoint: Option, - /// Container credentials URL, see - container_credentials_relative_uri: Option, - /// Client options - client_options: ClientOptions, - /// Credentials - credentials: Option, - /// Skip signing requests - skip_signature: ConfigValue, - /// Copy if not exists - copy_if_not_exists: Option>, -} - -/// Configuration keys for [`AmazonS3Builder`] -/// -/// Configuration via keys can be done via [`AmazonS3Builder::with_config`] -/// -/// # Example -/// ``` -/// # use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; -/// let builder = AmazonS3Builder::new() -/// .with_config("aws_access_key_id".parse().unwrap(), "my-access-key-id") -/// .with_config(AmazonS3ConfigKey::DefaultRegion, "my-default-region"); -/// ``` -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] -#[non_exhaustive] -pub enum AmazonS3ConfigKey { - /// AWS Access Key - /// - /// See [`AmazonS3Builder::with_access_key_id`] for details. - /// - /// Supported keys: - /// - `aws_access_key_id` - /// - `access_key_id` - AccessKeyId, - - /// Secret Access Key - /// - /// See [`AmazonS3Builder::with_secret_access_key`] for details. - /// - /// Supported keys: - /// - `aws_secret_access_key` - /// - `secret_access_key` - SecretAccessKey, - - /// Region - /// - /// See [`AmazonS3Builder::with_region`] for details. - /// - /// Supported keys: - /// - `aws_region` - /// - `region` - Region, - - /// Default region - /// - /// See [`AmazonS3Builder::with_region`] for details. - /// - /// Supported keys: - /// - `aws_default_region` - /// - `default_region` - DefaultRegion, - - /// Bucket name - /// - /// See [`AmazonS3Builder::with_bucket_name`] for details. - /// - /// Supported keys: - /// - `aws_bucket` - /// - `aws_bucket_name` - /// - `bucket` - /// - `bucket_name` - Bucket, - - /// Sets custom endpoint for communicating with AWS S3. - /// - /// See [`AmazonS3Builder::with_endpoint`] for details. - /// - /// Supported keys: - /// - `aws_endpoint` - /// - `aws_endpoint_url` - /// - `endpoint` - /// - `endpoint_url` - Endpoint, - - /// Token to use for requests (passed to underlying provider) - /// - /// See [`AmazonS3Builder::with_token`] for details. 
- /// - /// Supported keys: - /// - `aws_session_token` - /// - `aws_token` - /// - `session_token` - /// - `token` - Token, - - /// Fall back to ImdsV1 - /// - /// See [`AmazonS3Builder::with_imdsv1_fallback`] for details. - /// - /// Supported keys: - /// - `aws_imdsv1_fallback` - /// - `imdsv1_fallback` - ImdsV1Fallback, - - /// If virtual hosted style request has to be used - /// - /// See [`AmazonS3Builder::with_virtual_hosted_style_request`] for details. - /// - /// Supported keys: - /// - `aws_virtual_hosted_style_request` - /// - `virtual_hosted_style_request` - VirtualHostedStyleRequest, - - /// Avoid computing payload checksum when calculating signature. - /// - /// See [`AmazonS3Builder::with_unsigned_payload`] for details. - /// - /// Supported keys: - /// - `aws_unsigned_payload` - /// - `unsigned_payload` - UnsignedPayload, - - /// Set the checksum algorithm for this client - /// - /// See [`AmazonS3Builder::with_checksum_algorithm`] - Checksum, - - /// Set the instance metadata endpoint - /// - /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. - /// - /// Supported keys: - /// - `aws_metadata_endpoint` - /// - `metadata_endpoint` - MetadataEndpoint, - - /// Set the container credentials relative URI - /// - /// - ContainerCredentialsRelativeUri, - - /// Configure how to provide [`ObjectStore::copy_if_not_exists`] - /// - /// See [`S3CopyIfNotExists`] - CopyIfNotExists, - - /// Skip signing request - SkipSignature, - - /// Client options - Client(ClientConfigKey), -} - -impl AsRef for AmazonS3ConfigKey { - fn as_ref(&self) -> &str { - match self { - Self::AccessKeyId => "aws_access_key_id", - Self::SecretAccessKey => "aws_secret_access_key", - Self::Region => "aws_region", - Self::Bucket => "aws_bucket", - Self::Endpoint => "aws_endpoint", - Self::Token => "aws_session_token", - Self::ImdsV1Fallback => "aws_imdsv1_fallback", - Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", - Self::DefaultRegion => "aws_default_region", - Self::MetadataEndpoint => "aws_metadata_endpoint", - Self::UnsignedPayload => "aws_unsigned_payload", - Self::Checksum => "aws_checksum_algorithm", - Self::ContainerCredentialsRelativeUri => { - "aws_container_credentials_relative_uri" - } - Self::SkipSignature => "aws_skip_signature", - Self::CopyIfNotExists => "copy_if_not_exists", - Self::Client(opt) => opt.as_ref(), - } - } -} - -impl FromStr for AmazonS3ConfigKey { - type Err = super::Error; - - fn from_str(s: &str) -> Result { - match s { - "aws_access_key_id" | "access_key_id" => Ok(Self::AccessKeyId), - "aws_secret_access_key" | "secret_access_key" => Ok(Self::SecretAccessKey), - "aws_default_region" | "default_region" => Ok(Self::DefaultRegion), - "aws_region" | "region" => Ok(Self::Region), - "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => { - Ok(Self::Bucket) - } - "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => { - Ok(Self::Endpoint) - } - "aws_session_token" | "aws_token" | "session_token" | "token" => { - Ok(Self::Token) - } - "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { - Ok(Self::VirtualHostedStyleRequest) - } - "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), - "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), - "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), - "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), - "aws_container_credentials_relative_uri" => { - 
Ok(Self::ContainerCredentialsRelativeUri) - } - "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), - "copy_if_not_exists" => Ok(Self::CopyIfNotExists), - // Backwards compatibility - "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), - _ => match s.parse() { - Ok(key) => Ok(Self::Client(key)), - Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), - }, - } - } -} - -impl AmazonS3Builder { - /// Create a new [`AmazonS3Builder`] with default values. - pub fn new() -> Self { - Default::default() - } - - /// Fill the [`AmazonS3Builder`] with regular AWS environment variables - /// - /// Variables extracted from environment: - /// * `AWS_ACCESS_KEY_ID` -> access_key_id - /// * `AWS_SECRET_ACCESS_KEY` -> secret_access_key - /// * `AWS_DEFAULT_REGION` -> region - /// * `AWS_ENDPOINT` -> endpoint - /// * `AWS_SESSION_TOKEN` -> token - /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> - /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS - /// # Example - /// ``` - /// use object_store::aws::AmazonS3Builder; - /// - /// let s3 = AmazonS3Builder::from_env() - /// .with_bucket_name("foo") - /// .build(); - /// ``` - pub fn from_env() -> Self { - let mut builder: Self = Default::default(); - - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if key.starts_with("AWS_") { - if let Ok(config_key) = key.to_ascii_lowercase().parse() { - builder = builder.with_config(config_key, value); - } - } - } - } - - builder - } - - /// Parse available connection info form a well-known storage URL. - /// - /// The supported url schemes are: - /// - /// - `s3:///` - /// - `s3a:///` - /// - `https://s3..amazonaws.com/` - /// - `https://.s3..amazonaws.com` - /// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket` - /// - /// Note: Settings derived from the URL will override any others set on this builder - /// - /// # Example - /// ``` - /// use object_store::aws::AmazonS3Builder; - /// - /// let s3 = AmazonS3Builder::from_env() - /// .with_url("s3://bucket/path") - /// .build(); - /// ``` - pub fn with_url(mut self, url: impl Into) -> Self { - self.url = Some(url.into()); - self - } - - /// Set an option on the builder via a key - value pair. 
- pub fn with_config( - mut self, - key: AmazonS3ConfigKey, - value: impl Into, - ) -> Self { - match key { - AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), - AmazonS3ConfigKey::SecretAccessKey => { - self.secret_access_key = Some(value.into()) - } - AmazonS3ConfigKey::Region => self.region = Some(value.into()), - AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), - AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), - AmazonS3ConfigKey::Token => self.token = Some(value.into()), - AmazonS3ConfigKey::ImdsV1Fallback => self.imdsv1_fallback.parse(value), - AmazonS3ConfigKey::VirtualHostedStyleRequest => { - self.virtual_hosted_style_request.parse(value) - } - AmazonS3ConfigKey::DefaultRegion => { - self.region = self.region.or_else(|| Some(value.into())) - } - AmazonS3ConfigKey::MetadataEndpoint => { - self.metadata_endpoint = Some(value.into()) - } - AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), - AmazonS3ConfigKey::Checksum => { - self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) - } - AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { - self.container_credentials_relative_uri = Some(value.into()) - } - AmazonS3ConfigKey::Client(key) => { - self.client_options = self.client_options.with_config(key, value) - } - AmazonS3ConfigKey::SkipSignature => self.skip_signature.parse(value), - AmazonS3ConfigKey::CopyIfNotExists => { - self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) - } - }; - self - } - - /// Set an option on the builder via a key - value pair. - /// - /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. - #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - /// - /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - - /// Get config value via a [`AmazonS3ConfigKey`]. 
- /// - /// # Example - /// ``` - /// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; - /// - /// let builder = AmazonS3Builder::from_env() - /// .with_bucket_name("foo"); - /// let bucket_name = builder.get_config_value(&AmazonS3ConfigKey::Bucket).unwrap_or_default(); - /// assert_eq!("foo", &bucket_name); - /// ``` - pub fn get_config_value(&self, key: &AmazonS3ConfigKey) -> Option { - match key { - AmazonS3ConfigKey::AccessKeyId => self.access_key_id.clone(), - AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key.clone(), - AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => { - self.region.clone() - } - AmazonS3ConfigKey::Bucket => self.bucket_name.clone(), - AmazonS3ConfigKey::Endpoint => self.endpoint.clone(), - AmazonS3ConfigKey::Token => self.token.clone(), - AmazonS3ConfigKey::ImdsV1Fallback => Some(self.imdsv1_fallback.to_string()), - AmazonS3ConfigKey::VirtualHostedStyleRequest => { - Some(self.virtual_hosted_style_request.to_string()) - } - AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), - AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), - AmazonS3ConfigKey::Checksum => { - self.checksum_algorithm.as_ref().map(ToString::to_string) - } - AmazonS3ConfigKey::Client(key) => self.client_options.get_config_value(key), - AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { - self.container_credentials_relative_uri.clone() - } - AmazonS3ConfigKey::SkipSignature => Some(self.skip_signature.to_string()), - AmazonS3ConfigKey::CopyIfNotExists => { - self.copy_if_not_exists.as_ref().map(ToString::to_string) - } - } - } - - /// Sets properties on this builder based on a URL - /// - /// This is a separate member function to allow fallible computation to - /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] - fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - match parsed.scheme() { - "s3" | "s3a" => self.bucket_name = Some(host.to_string()), - "https" => match host.splitn(4, '.').collect_tuple() { - Some(("s3", region, "amazonaws", "com")) => { - self.region = Some(region.to_string()); - let bucket = parsed.path_segments().into_iter().flatten().next(); - if let Some(bucket) = bucket { - self.bucket_name = Some(bucket.into()); - } - } - Some((bucket, "s3", region, "amazonaws.com")) => { - self.bucket_name = Some(bucket.to_string()); - self.region = Some(region.to_string()); - self.virtual_hosted_style_request = true.into(); - } - Some((account, "r2", "cloudflarestorage", "com")) => { - self.region = Some("auto".to_string()); - let endpoint = format!("https://{account}.r2.cloudflarestorage.com"); - self.endpoint = Some(endpoint); - - let bucket = parsed.path_segments().into_iter().flatten().next(); - if let Some(bucket) = bucket { - self.bucket_name = Some(bucket.into()); - } - } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), - }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), - }; - Ok(()) - } - - /// Set the AWS Access Key (required) - pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { - self.access_key_id = Some(access_key_id.into()); - self - } - - /// Set the AWS Secret Access Key (required) - pub fn with_secret_access_key( - mut self, - secret_access_key: impl Into, - ) -> Self { - self.secret_access_key = Some(secret_access_key.into()); - self - } - - /// Set the 
region (e.g. `us-east-1`) (required) - pub fn with_region(mut self, region: impl Into) -> Self { - self.region = Some(region.into()); - self - } - - /// Set the bucket_name (required) - pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { - self.bucket_name = Some(bucket_name.into()); - self - } - - /// Sets the endpoint for communicating with AWS S3. Default value - /// is based on region. The `endpoint` field should be consistent with - /// the field `virtual_hosted_style_request'. - /// - /// For example, this might be set to `"http://localhost:4566:` - /// for testing against a localstack instance. - /// If `virtual_hosted_style_request` is set to true then `endpoint` - /// should have bucket name included. - pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { - self.endpoint = Some(endpoint.into()); - self - } - - /// Set the token to use for requests (passed to underlying provider) - pub fn with_token(mut self, token: impl Into) -> Self { - self.token = Some(token.into()); - self - } - - /// Set the credential provider overriding any other options - pub fn with_credentials(mut self, credentials: AwsCredentialProvider) -> Self { - self.credentials = Some(credentials); - self - } - - /// Sets what protocol is allowed. If `allow_http` is : - /// * false (default): Only HTTPS are allowed - /// * true: HTTP and HTTPS are allowed - pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.client_options = self.client_options.with_allow_http(allow_http); - self - } - - /// Sets if virtual hosted style request has to be used. - /// If `virtual_hosted_style_request` is : - /// * false (default): Path style request is used - /// * true: Virtual hosted style request is used - /// - /// If the `endpoint` is provided then it should be - /// consistent with `virtual_hosted_style_request`. - /// i.e. if `virtual_hosted_style_request` is set to true - /// then `endpoint` should have bucket name included. - pub fn with_virtual_hosted_style_request( - mut self, - virtual_hosted_style_request: bool, - ) -> Self { - self.virtual_hosted_style_request = virtual_hosted_style_request.into(); - self - } - - /// Set the retry configuration - pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { - self.retry_config = retry_config; - self - } - - /// By default instance credentials will only be fetched over [IMDSv2], as AWS recommends - /// against having IMDSv1 enabled on EC2 instances as it is vulnerable to [SSRF attack] - /// - /// However, certain deployment environments, such as those running old versions of kube2iam, - /// may not support IMDSv2. This option will enable automatic fallback to using IMDSv1 - /// if the token endpoint returns a 403 error indicating that IMDSv2 is not supported. - /// - /// This option has no effect if not using instance credentials - /// - /// [IMDSv2]: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html - /// [SSRF attack]: https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/ - /// - pub fn with_imdsv1_fallback(mut self) -> Self { - self.imdsv1_fallback = true.into(); - self - } - - /// Sets if unsigned payload option has to be used. - /// See [unsigned payload option](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-header-based-auth.html) - /// * false (default): Signed payload option is used, where the checksum for the request body is computed and included when constructing a canonical request. 
- /// * true: Unsigned payload option is used. `UNSIGNED-PAYLOAD` literal is included when constructing a canonical request, - pub fn with_unsigned_payload(mut self, unsigned_payload: bool) -> Self { - self.unsigned_payload = unsigned_payload.into(); - self - } - - /// If enabled, [`AmazonS3`] will not fetch credentials and will not sign requests - /// - /// This can be useful when interacting with public S3 buckets that deny authorized requests - pub fn with_skip_signature(mut self, skip_signature: bool) -> Self { - self.skip_signature = skip_signature.into(); - self - } - - /// Sets the [checksum algorithm] which has to be used for object integrity check during upload. - /// - /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html - pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { - // Convert to String to enable deferred parsing of config - self.checksum_algorithm = Some(checksum_algorithm.into()); - self - } - - /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), - /// used primarily within AWS EC2. - /// - /// This defaults to the IPv4 endpoint: http://169.254.169.254. One can alternatively use the IPv6 - /// endpoint http://fd00:ec2::254. - pub fn with_metadata_endpoint(mut self, endpoint: impl Into) -> Self { - self.metadata_endpoint = Some(endpoint.into()); - self - } - - /// Set the proxy_url to be used by the underlying client - pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_url(proxy_url); - self - } - - /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { - self.client_options = self - .client_options - .with_proxy_ca_certificate(proxy_ca_certificate); - self - } - - /// Set a list of hosts to exclude from proxy connections - pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); - self - } - - /// Sets the client options, overriding any already set - pub fn with_client_options(mut self, options: ClientOptions) -> Self { - self.client_options = options; - self - } - - /// Configure how to provide [`ObjectStore::copy_if_not_exists`] - pub fn with_copy_if_not_exists(mut self, config: S3CopyIfNotExists) -> Self { - self.copy_if_not_exists = Some(config.into()); - self - } - - /// Create a [`AmazonS3`] instance from the provided values, - /// consuming `self`. 
- pub fn build(mut self) -> Result { - if let Some(url) = self.url.take() { - self.parse_url(&url)?; - } - - let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; - let region = self.region.context(MissingRegionSnafu)?; - let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; - let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; - - let credentials = if let Some(credentials) = self.credentials { - credentials - } else if self.access_key_id.is_some() || self.secret_access_key.is_some() { - match (self.access_key_id, self.secret_access_key, self.token) { - (Some(key_id), Some(secret_key), token) => { - info!("Using Static credential provider"); - let credential = AwsCredential { - key_id, - secret_key, - token, - }; - Arc::new(StaticCredentialProvider::new(credential)) as _ - } - (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), - (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), - (None, None, _) => unreachable!(), - } - } else if let (Ok(token_path), Ok(role_arn)) = ( - std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), - std::env::var("AWS_ROLE_ARN"), - ) { - // TODO: Replace with `AmazonS3Builder::credentials_from_env` - info!("Using WebIdentity credential provider"); - - let session_name = std::env::var("AWS_ROLE_SESSION_NAME") - .unwrap_or_else(|_| "WebIdentitySession".to_string()); - - let endpoint = format!("https://sts.{region}.amazonaws.com"); - - // Disallow non-HTTPs requests - let client = self - .client_options - .clone() - .with_allow_http(false) - .client()?; - - let token = WebIdentityProvider { - token_path, - session_name, - role_arn, - endpoint, - }; - - Arc::new(TokenCredentialProvider::new( - token, - client, - self.retry_config.clone(), - )) as _ - } else if let Some(uri) = self.container_credentials_relative_uri { - info!("Using Task credential provider"); - Arc::new(TaskCredentialProvider { - url: format!("http://169.254.170.2{uri}"), - retry: self.retry_config.clone(), - // The instance metadata endpoint is access over HTTP - client: self.client_options.clone().with_allow_http(true).client()?, - cache: Default::default(), - }) as _ - } else { - info!("Using Instance credential provider"); - - let token = InstanceCredentialProvider { - cache: Default::default(), - imdsv1_fallback: self.imdsv1_fallback.get()?, - metadata_endpoint: self - .metadata_endpoint - .unwrap_or_else(|| DEFAULT_METADATA_ENDPOINT.into()), - }; - - Arc::new(TokenCredentialProvider::new( - token, - self.client_options.metadata_client()?, - self.retry_config.clone(), - )) as _ - }; - - let endpoint: String; - let bucket_endpoint: String; - - // If `endpoint` is provided then its assumed to be consistent with - // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then - // `endpoint` should have bucket name included. - if self.virtual_hosted_style_request.get()? 
{ - endpoint = self - .endpoint - .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); - bucket_endpoint = endpoint.clone(); - } else { - endpoint = self - .endpoint - .unwrap_or_else(|| format!("https://s3.{region}.amazonaws.com")); - bucket_endpoint = format!("{endpoint}/{bucket}"); - } - - let config = S3Config { - region, - endpoint, - bucket, - bucket_endpoint, - credentials, - retry_config: self.retry_config, - client_options: self.client_options, - sign_payload: !self.unsigned_payload.get()?, - skip_signature: self.skip_signature.get()?, - checksum, - copy_if_not_exists, - }; - - let client = Arc::new(S3Client::new(config)?); - - Ok(AmazonS3 { client }) - } -} - #[cfg(test)] mod tests { use super::*; - use crate::tests::{ - copy_if_not_exists, get_nonexistent_object, get_opts, - list_uses_directories_correctly, list_with_delimiter, put_get_delete_list_opts, - rename_and_copy, stream_get, - }; + use crate::tests::*; use bytes::Bytes; - use std::collections::HashMap; const NON_EXISTENT_NAME: &str = "nonexistentname"; - #[test] - fn s3_test_config_from_map() { - let aws_access_key_id = "object_store:fake_access_key_id".to_string(); - let aws_secret_access_key = "object_store:fake_secret_key".to_string(); - let aws_default_region = "object_store:fake_default_region".to_string(); - let aws_endpoint = "object_store:fake_endpoint".to_string(); - let aws_session_token = "object_store:fake_session_token".to_string(); - let options = HashMap::from([ - ("aws_access_key_id", aws_access_key_id.clone()), - ("aws_secret_access_key", aws_secret_access_key), - ("aws_default_region", aws_default_region.clone()), - ("aws_endpoint", aws_endpoint.clone()), - ("aws_session_token", aws_session_token.clone()), - ("aws_unsigned_payload", "true".to_string()), - ("aws_checksum_algorithm", "sha256".to_string()), - ]); - - let builder = options - .into_iter() - .fold(AmazonS3Builder::new(), |builder, (key, value)| { - builder.with_config(key.parse().unwrap(), value) - }) - .with_config(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key"); - - assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); - assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); - assert_eq!(builder.region.unwrap(), aws_default_region); - assert_eq!(builder.endpoint.unwrap(), aws_endpoint); - assert_eq!(builder.token.unwrap(), aws_session_token); - assert_eq!( - builder.checksum_algorithm.unwrap().get().unwrap(), - Checksum::SHA256 - ); - assert!(builder.unsigned_payload.get().unwrap()); - } - - #[test] - fn s3_test_config_get_value() { - let aws_access_key_id = "object_store:fake_access_key_id".to_string(); - let aws_secret_access_key = "object_store:fake_secret_key".to_string(); - let aws_default_region = "object_store:fake_default_region".to_string(); - let aws_endpoint = "object_store:fake_endpoint".to_string(); - let aws_session_token = "object_store:fake_session_token".to_string(); - - let builder = AmazonS3Builder::new() - .with_config(AmazonS3ConfigKey::AccessKeyId, &aws_access_key_id) - .with_config(AmazonS3ConfigKey::SecretAccessKey, &aws_secret_access_key) - .with_config(AmazonS3ConfigKey::DefaultRegion, &aws_default_region) - .with_config(AmazonS3ConfigKey::Endpoint, &aws_endpoint) - .with_config(AmazonS3ConfigKey::Token, &aws_session_token) - .with_config(AmazonS3ConfigKey::UnsignedPayload, "true"); - - assert_eq!( - builder - .get_config_value(&AmazonS3ConfigKey::AccessKeyId) - .unwrap(), - aws_access_key_id - ); - assert_eq!( - builder - 
.get_config_value(&AmazonS3ConfigKey::SecretAccessKey) - .unwrap(), - aws_secret_access_key - ); - assert_eq!( - builder - .get_config_value(&AmazonS3ConfigKey::DefaultRegion) - .unwrap(), - aws_default_region - ); - assert_eq!( - builder - .get_config_value(&AmazonS3ConfigKey::Endpoint) - .unwrap(), - aws_endpoint - ); - assert_eq!( - builder.get_config_value(&AmazonS3ConfigKey::Token).unwrap(), - aws_session_token - ); - assert_eq!( - builder - .get_config_value(&AmazonS3ConfigKey::UnsignedPayload) - .unwrap(), - "true" - ); - } - #[tokio::test] async fn s3_test() { crate::test_util::maybe_skip_integration!(); let config = AmazonS3Builder::from_env(); - let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); - let test_not_exists = config.copy_if_not_exists.is_some(); let integration = config.build().unwrap(); + let config = integration.client.config(); + let is_local = config.endpoint.starts_with("http://"); + let test_not_exists = config.copy_if_not_exists.is_some(); // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 put_get_delete_list_opts(&integration, is_local).await; @@ -1279,16 +309,14 @@ mod tests { } // run integration test with unsigned payload enabled - let config = AmazonS3Builder::from_env().with_unsigned_payload(true); - let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); - let integration = config.build().unwrap(); + let builder = AmazonS3Builder::from_env().with_unsigned_payload(true); + let integration = builder.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; // run integration test with checksum set to sha256 - let config = + let builder = AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); - let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); - let integration = config.build().unwrap(); + let integration = builder.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; } @@ -1352,161 +380,6 @@ mod tests { assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); } - #[tokio::test] - async fn s3_test_proxy_url() { - let s3 = AmazonS3Builder::new() - .with_access_key_id("access_key_id") - .with_secret_access_key("secret_access_key") - .with_region("region") - .with_bucket_name("bucket_name") - .with_allow_http(true) - .with_proxy_url("https://example.com") - .build(); - - assert!(s3.is_ok()); - - let err = AmazonS3Builder::new() - .with_access_key_id("access_key_id") - .with_secret_access_key("secret_access_key") - .with_region("region") - .with_bucket_name("bucket_name") - .with_allow_http(true) - .with_proxy_url("asdf://example.com") - .build() - .unwrap_err() - .to_string(); - - assert_eq!( - "Generic HTTP client error: builder error: unknown proxy scheme", - err - ); - } - - #[test] - fn s3_test_urls() { - let mut builder = AmazonS3Builder::new(); - builder.parse_url("s3://bucket/path").unwrap(); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("s3://buckets.can.have.dots/path") - .unwrap(); - assert_eq!( - builder.bucket_name, - Some("buckets.can.have.dots".to_string()) - ); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://s3.region.amazonaws.com") - .unwrap(); - assert_eq!(builder.region, Some("region".to_string())); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://s3.region.amazonaws.com/bucket") - .unwrap(); - assert_eq!(builder.region, Some("region".to_string())); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://s3.region.amazonaws.com/bucket.with.dot/path") - .unwrap(); - assert_eq!(builder.region, Some("region".to_string())); - assert_eq!(builder.bucket_name, Some("bucket.with.dot".to_string())); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://bucket.s3.region.amazonaws.com") - .unwrap(); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); - assert_eq!(builder.region, Some("region".to_string())); - assert!(builder.virtual_hosted_style_request.get().unwrap()); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://account123.r2.cloudflarestorage.com/bucket-123") - .unwrap(); - - assert_eq!(builder.bucket_name, Some("bucket-123".to_string())); - assert_eq!(builder.region, Some("auto".to_string())); - assert_eq!( - builder.endpoint, - Some("https://account123.r2.cloudflarestorage.com".to_string()) - ); - - let err_cases = [ - "mailto://bucket/path", - "https://s3.bucket.mydomain.com", - "https://s3.bucket.foo.amazonaws.com", - "https://bucket.mydomain.region.amazonaws.com", - "https://bucket.s3.region.bar.amazonaws.com", - "https://bucket.foo.s3.amazonaws.com", - ]; - let mut builder = AmazonS3Builder::new(); - for case in err_cases { - builder.parse_url(case).unwrap_err(); - } - } - - #[test] - fn test_invalid_config() { - let err = AmazonS3Builder::new() - .with_config(AmazonS3ConfigKey::ImdsV1Fallback, "enabled") - .with_bucket_name("bucket") - .with_region("region") - .build() - .unwrap_err() - .to_string(); - - assert_eq!( - err, - "Generic Config error: failed to parse \"enabled\" as boolean" - ); - - let err = AmazonS3Builder::new() - .with_config(AmazonS3ConfigKey::Checksum, "md5") - .with_bucket_name("bucket") - .with_region("region") - .build() - .unwrap_err() - .to_string(); - - assert_eq!( - err, - "Generic Config error: \"md5\" is not a valid checksum algorithm" - ); - } -} - -#[cfg(test)] -mod 
s3_resolve_bucket_region_tests { - use super::*; - - #[tokio::test] - async fn test_private_bucket() { - let bucket = "bloxbender"; - - let region = resolve_bucket_region(bucket, &ClientOptions::new()) - .await - .unwrap(); - - let expected = "us-west-2".to_string(); - - assert_eq!(region, expected); - } - - #[tokio::test] - async fn test_bucket_does_not_exist() { - let bucket = "please-dont-exist"; - - let result = resolve_bucket_region(bucket, &ClientOptions::new()).await; - - assert!(result.is_err()); - } - #[tokio::test] #[ignore = "Tests shouldn't call use remote services by default"] async fn test_disable_creds() { diff --git a/src/aws/resolve.rs b/src/aws/resolve.rs new file mode 100644 index 0000000..2b21fab --- /dev/null +++ b/src/aws/resolve.rs @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::aws::STORE; +use crate::{ClientOptions, Result}; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Bucket '{}' not found", bucket))] + BucketNotFound { bucket: String }, + + #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + ResolveRegion { + bucket: String, + source: reqwest::Error, + }, + + #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + RegionParse { bucket: String }, +} + +impl From for crate::Error { + fn from(source: Error) -> Self { + Self::Generic { + store: STORE, + source: Box::new(source), + } + } +} + +/// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. 
+/// +/// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html +pub async fn resolve_bucket_region( + bucket: &str, + client_options: &ClientOptions, +) -> Result { + use reqwest::StatusCode; + + let endpoint = format!("https://{}.s3.amazonaws.com", bucket); + + let client = client_options.client()?; + + let response = client + .head(&endpoint) + .send() + .await + .context(ResolveRegionSnafu { bucket })?; + + ensure!( + response.status() != StatusCode::NOT_FOUND, + BucketNotFoundSnafu { bucket } + ); + + let region = response + .headers() + .get("x-amz-bucket-region") + .and_then(|x| x.to_str().ok()) + .context(RegionParseSnafu { bucket })?; + + Ok(region.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_private_bucket() { + let bucket = "bloxbender"; + + let region = resolve_bucket_region(bucket, &ClientOptions::new()) + .await + .unwrap(); + + let expected = "us-west-2".to_string(); + + assert_eq!(region, expected); + } + + #[tokio::test] + async fn test_bucket_does_not_exist() { + let bucket = "please-dont-exist"; + + let result = resolve_bucket_region(bucket, &ClientOptions::new()).await; + + assert!(result.is_err()); + } +} From 485422ce912726d008cb45d0d1cef0937ccd5cc3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:41:03 +0100 Subject: [PATCH 210/397] Split azure Module (#4954) * Split azure module * Format * Docs --- src/azure/builder.rs | 1101 ++++++++++++++++++++++++++++++++++++++++++ src/azure/mod.rs | 1081 +---------------------------------------- 2 files changed, 1112 insertions(+), 1070 deletions(-) create mode 100644 src/azure/builder.rs diff --git a/src/azure/builder.rs b/src/azure/builder.rs new file mode 100644 index 0000000..eb2de14 --- /dev/null +++ b/src/azure/builder.rs @@ -0,0 +1,1101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::azure::client::{AzureClient, AzureConfig}; +use crate::azure::credential::{ + AzureCliCredential, ClientSecretOAuthProvider, ImdsManagedIdentityProvider, + WorkloadIdentityOAuthProvider, +}; +use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; +use crate::client::TokenCredentialProvider; +use crate::config::ConfigValue; +use crate::{ + ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, +}; +use percent_encoding::percent_decode_str; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::str::FromStr; +use std::sync::Arc; +use url::Url; + +/// The well-known account used by Azurite and the legacy Azure Storage Emulator. 
+/// +/// +const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; + +/// The well-known account key used by Azurite and the legacy Azure Storage Emulator. +/// +/// +const EMULATOR_ACCOUNT_KEY: &str = + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="; + +const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; + +/// A specialized `Error` for Azure builder-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unable parse emulator url {}={}, Error: {}", + env_name, + env_value, + source + ))] + UnableToParseEmulatorUrl { + env_name: String, + env_value: String, + source: url::ParseError, + }, + + #[snafu(display("Account must be specified"))] + MissingAccount {}, + + #[snafu(display("Container name must be specified"))] + MissingContainerName {}, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, + + #[snafu(display("Failed parsing an SAS key"))] + DecodeSasKey { source: std::str::Utf8Error }, + + #[snafu(display("Missing component in SAS query pair"))] + MissingSasComponent {}, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, +} + +impl From for crate::Error { + fn from(source: Error) -> Self { + match source { + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: STORE, key } + } + _ => Self::Generic { + store: STORE, + source: Box::new(source), + }, + } + } +} + +/// Configure a connection to Microsoft Azure Blob Storage container using +/// the specified credentials. 
+/// +/// # Example +/// ``` +/// # let ACCOUNT = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY = "foo"; +/// # use object_store::azure::MicrosoftAzureBuilder; +/// let azure = MicrosoftAzureBuilder::new() +/// .with_account(ACCOUNT) +/// .with_access_key(ACCESS_KEY) +/// .with_container_name(BUCKET_NAME) +/// .build(); +/// ``` +#[derive(Default, Clone)] +pub struct MicrosoftAzureBuilder { + /// Account name + account_name: Option, + /// Access key + access_key: Option, + /// Container name + container_name: Option, + /// Bearer token + bearer_token: Option, + /// Client id + client_id: Option, + /// Client secret + client_secret: Option, + /// Tenant id + tenant_id: Option, + /// Query pairs for shared access signature authorization + sas_query_pairs: Option>, + /// Shared access signature + sas_key: Option, + /// Authority host + authority_host: Option, + /// Url + url: Option, + /// When set to true, azurite storage emulator has to be used + use_emulator: ConfigValue, + /// Storage endpoint + endpoint: Option, + /// Msi endpoint for acquiring managed identity token + msi_endpoint: Option, + /// Object id for use with managed identity authentication + object_id: Option, + /// Msi resource id for use with managed identity authentication + msi_resource_id: Option, + /// File containing token for Azure AD workload identity federation + federated_token_file: Option, + /// When set to true, azure cli has to be used for acquiring access token + use_azure_cli: ConfigValue, + /// Retry config + retry_config: RetryConfig, + /// Client options + client_options: ClientOptions, + /// Credentials + credentials: Option, + /// When set to true, fabric url scheme will be used + /// + /// i.e. https://{account_name}.dfs.fabric.microsoft.com + use_fabric_endpoint: ConfigValue, +} + +/// Configuration keys for [`MicrosoftAzureBuilder`] +/// +/// Configuration via keys can be done via [`MicrosoftAzureBuilder::with_config`] +/// +/// # Example +/// ``` +/// # use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; +/// let builder = MicrosoftAzureBuilder::new() +/// .with_config("azure_client_id".parse().unwrap(), "my-client-id") +/// .with_config(AzureConfigKey::AuthorityId, "my-tenant-id"); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] +#[non_exhaustive] +pub enum AzureConfigKey { + /// The name of the azure storage account + /// + /// Supported keys: + /// - `azure_storage_account_name` + /// - `account_name` + AccountName, + + /// Master key for accessing storage account + /// + /// Supported keys: + /// - `azure_storage_account_key` + /// - `azure_storage_access_key` + /// - `azure_storage_master_key` + /// - `access_key` + /// - `account_key` + /// - `master_key` + AccessKey, + + /// Service principal client id for authorizing requests + /// + /// Supported keys: + /// - `azure_storage_client_id` + /// - `azure_client_id` + /// - `client_id` + ClientId, + + /// Service principal client secret for authorizing requests + /// + /// Supported keys: + /// - `azure_storage_client_secret` + /// - `azure_client_secret` + /// - `client_secret` + ClientSecret, + + /// Tenant id used in oauth flows + /// + /// Supported keys: + /// - `azure_storage_tenant_id` + /// - `azure_storage_authority_id` + /// - `azure_tenant_id` + /// - `azure_authority_id` + /// - `tenant_id` + /// - `authority_id` + AuthorityId, + + /// Shared access signature. 
+ /// + /// The signature is expected to be percent-encoded, much like they are provided + /// in the azure storage explorer or azure portal. + /// + /// Supported keys: + /// - `azure_storage_sas_key` + /// - `azure_storage_sas_token` + /// - `sas_key` + /// - `sas_token` + SasKey, + + /// Bearer token + /// + /// Supported keys: + /// - `azure_storage_token` + /// - `bearer_token` + /// - `token` + Token, + + /// Use object store with azurite storage emulator + /// + /// Supported keys: + /// - `azure_storage_use_emulator` + /// - `object_store_use_emulator` + /// - `use_emulator` + UseEmulator, + + /// Override the endpoint used to communicate with blob storage + /// + /// Supported keys: + /// - `azure_storage_endpoint` + /// - `azure_endpoint` + /// - `endpoint` + Endpoint, + + /// Use object store with url scheme account.dfs.fabric.microsoft.com + /// + /// Supported keys: + /// - `azure_use_fabric_endpoint` + /// - `use_fabric_endpoint` + UseFabricEndpoint, + + /// Endpoint to request a imds managed identity token + /// + /// Supported keys: + /// - `azure_msi_endpoint` + /// - `azure_identity_endpoint` + /// - `identity_endpoint` + /// - `msi_endpoint` + MsiEndpoint, + + /// Object id for use with managed identity authentication + /// + /// Supported keys: + /// - `azure_object_id` + /// - `object_id` + ObjectId, + + /// Msi resource id for use with managed identity authentication + /// + /// Supported keys: + /// - `azure_msi_resource_id` + /// - `msi_resource_id` + MsiResourceId, + + /// File containing token for Azure AD workload identity federation + /// + /// Supported keys: + /// - `azure_federated_token_file` + /// - `federated_token_file` + FederatedTokenFile, + + /// Use azure cli for acquiring access token + /// + /// Supported keys: + /// - `azure_use_azure_cli` + /// - `use_azure_cli` + UseAzureCli, + + /// Container name + /// + /// Supported keys: + /// - `azure_container_name` + /// - `container_name` + ContainerName, + + /// Client options + Client(ClientConfigKey), +} + +impl AsRef for AzureConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AccountName => "azure_storage_account_name", + Self::AccessKey => "azure_storage_account_key", + Self::ClientId => "azure_storage_client_id", + Self::ClientSecret => "azure_storage_client_secret", + Self::AuthorityId => "azure_storage_tenant_id", + Self::SasKey => "azure_storage_sas_key", + Self::Token => "azure_storage_token", + Self::UseEmulator => "azure_storage_use_emulator", + Self::UseFabricEndpoint => "azure_use_fabric_endpoint", + Self::Endpoint => "azure_storage_endpoint", + Self::MsiEndpoint => "azure_msi_endpoint", + Self::ObjectId => "azure_object_id", + Self::MsiResourceId => "azure_msi_resource_id", + Self::FederatedTokenFile => "azure_federated_token_file", + Self::UseAzureCli => "azure_use_azure_cli", + Self::ContainerName => "azure_container_name", + Self::Client(key) => key.as_ref(), + } + } +} + +impl FromStr for AzureConfigKey { + type Err = crate::Error; + + fn from_str(s: &str) -> Result { + match s { + "azure_storage_account_key" + | "azure_storage_access_key" + | "azure_storage_master_key" + | "master_key" + | "account_key" + | "access_key" => Ok(Self::AccessKey), + "azure_storage_account_name" | "account_name" => Ok(Self::AccountName), + "azure_storage_client_id" | "azure_client_id" | "client_id" => { + Ok(Self::ClientId) + } + "azure_storage_client_secret" | "azure_client_secret" | "client_secret" => { + Ok(Self::ClientSecret) + } + "azure_storage_tenant_id" + | "azure_storage_authority_id" 
+ | "azure_tenant_id" + | "azure_authority_id" + | "tenant_id" + | "authority_id" => Ok(Self::AuthorityId), + "azure_storage_sas_key" + | "azure_storage_sas_token" + | "sas_key" + | "sas_token" => Ok(Self::SasKey), + "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), + "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), + "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => { + Ok(Self::Endpoint) + } + "azure_msi_endpoint" + | "azure_identity_endpoint" + | "identity_endpoint" + | "msi_endpoint" => Ok(Self::MsiEndpoint), + "azure_object_id" | "object_id" => Ok(Self::ObjectId), + "azure_msi_resource_id" | "msi_resource_id" => Ok(Self::MsiResourceId), + "azure_federated_token_file" | "federated_token_file" => { + Ok(Self::FederatedTokenFile) + } + "azure_use_fabric_endpoint" | "use_fabric_endpoint" => { + Ok(Self::UseFabricEndpoint) + } + "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), + "azure_container_name" | "container_name" => Ok(Self::ContainerName), + // Backwards compatibility + "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, + } + } +} + +impl std::fmt::Debug for MicrosoftAzureBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "MicrosoftAzureBuilder {{ account: {:?}, container_name: {:?} }}", + self.account_name, self.container_name + ) + } +} + +impl MicrosoftAzureBuilder { + /// Create a new [`MicrosoftAzureBuilder`] with default values. + pub fn new() -> Self { + Default::default() + } + + /// Create an instance of [`MicrosoftAzureBuilder`] with values pre-populated from environment variables. + /// + /// Variables extracted from environment: + /// * AZURE_STORAGE_ACCOUNT_NAME: storage account name + /// * AZURE_STORAGE_ACCOUNT_KEY: storage account master key + /// * AZURE_STORAGE_ACCESS_KEY: alias for AZURE_STORAGE_ACCOUNT_KEY + /// * AZURE_STORAGE_CLIENT_ID -> client id for service principal authorization + /// * AZURE_STORAGE_CLIENT_SECRET -> client secret for service principal authorization + /// * AZURE_STORAGE_TENANT_ID -> tenant id used in oauth flows + /// # Example + /// ``` + /// use object_store::azure::MicrosoftAzureBuilder; + /// + /// let azure = MicrosoftAzureBuilder::from_env() + /// .with_container_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder = Self::default(); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("AZURE_") { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + builder = builder.with_config(config_key, value); + } + } + } + } + + if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { + builder = builder.with_msi_endpoint(text); + } + + builder + } + + /// Parse available connection info form a well-known storage URL. 
+ /// + /// The supported url schemes are: + /// + /// - `abfs[s]:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `abfs[s]://@.dfs.core.windows.net/` + /// - `abfs[s]://@.dfs.fabric.microsoft.com/` + /// - `az:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `adl:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `azure:///` (custom) + /// - `https://.dfs.core.windows.net` + /// - `https://.blob.core.windows.net` + /// - `https://.dfs.fabric.microsoft.com` + /// - `https://.dfs.fabric.microsoft.com/` + /// - `https://.blob.fabric.microsoft.com` + /// - `https://.blob.fabric.microsoft.com/` + /// + /// Note: Settings derived from the URL will override any others set on this builder + /// + /// # Example + /// ``` + /// use object_store::azure::MicrosoftAzureBuilder; + /// + /// let azure = MicrosoftAzureBuilder::from_env() + /// .with_url("abfss://file_system@account.dfs.core.windows.net/") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Set an option on the builder via a key - value pair. + pub fn with_config(mut self, key: AzureConfigKey, value: impl Into) -> Self { + match key { + AzureConfigKey::AccessKey => self.access_key = Some(value.into()), + AzureConfigKey::AccountName => self.account_name = Some(value.into()), + AzureConfigKey::ClientId => self.client_id = Some(value.into()), + AzureConfigKey::ClientSecret => self.client_secret = Some(value.into()), + AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), + AzureConfigKey::SasKey => self.sas_key = Some(value.into()), + AzureConfigKey::Token => self.bearer_token = Some(value.into()), + AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), + AzureConfigKey::ObjectId => self.object_id = Some(value.into()), + AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), + AzureConfigKey::FederatedTokenFile => { + self.federated_token_file = Some(value.into()) + } + AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), + AzureConfigKey::UseEmulator => self.use_emulator.parse(value), + AzureConfigKey::Endpoint => self.endpoint = Some(value.into()), + AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), + AzureConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } + AzureConfigKey::ContainerName => self.container_name = Some(value.into()), + }; + self + } + + /// Set an option on the builder via a key - value pair. + #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) + } + + /// Hydrate builder from key value pairs + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + + /// Get config value via a [`AzureConfigKey`]. 
+ /// + /// # Example + /// ``` + /// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; + /// + /// let builder = MicrosoftAzureBuilder::from_env() + /// .with_account("foo"); + /// let account_name = builder.get_config_value(&AzureConfigKey::AccountName).unwrap_or_default(); + /// assert_eq!("foo", &account_name); + /// ``` + pub fn get_config_value(&self, key: &AzureConfigKey) -> Option { + match key { + AzureConfigKey::AccountName => self.account_name.clone(), + AzureConfigKey::AccessKey => self.access_key.clone(), + AzureConfigKey::ClientId => self.client_id.clone(), + AzureConfigKey::ClientSecret => self.client_secret.clone(), + AzureConfigKey::AuthorityId => self.tenant_id.clone(), + AzureConfigKey::SasKey => self.sas_key.clone(), + AzureConfigKey::Token => self.bearer_token.clone(), + AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), + AzureConfigKey::UseFabricEndpoint => { + Some(self.use_fabric_endpoint.to_string()) + } + AzureConfigKey::Endpoint => self.endpoint.clone(), + AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), + AzureConfigKey::ObjectId => self.object_id.clone(), + AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), + AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), + AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), + AzureConfigKey::Client(key) => self.client_options.get_config_value(key), + AzureConfigKey::ContainerName => self.container_name.clone(), + } + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "az" | "adl" | "azure" => self.container_name = Some(validate(host)?), + "abfs" | "abfss" => { + // abfs(s) might refer to the fsspec convention abfs:/// + // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ + if parsed.username().is_empty() { + self.container_name = Some(validate(host)?); + } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + } else if let Some(a) = host.strip_suffix(".dfs.fabric.microsoft.com") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + self.use_fabric_endpoint = true.into(); + } else { + return Err(UrlNotRecognisedSnafu { url }.build().into()); + } + } + "https" => match host.split_once('.') { + Some((a, "dfs.core.windows.net")) + | Some((a, "blob.core.windows.net")) => { + self.account_name = Some(validate(a)?); + } + Some((a, "dfs.fabric.microsoft.com")) + | Some((a, "blob.fabric.microsoft.com")) => { + self.account_name = Some(validate(a)?); + // Attempt to infer the container name from the URL + // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv + // - https://onelake.dfs.fabric.microsoft.com//.// + // + // See + if let Some(workspace) = parsed.path_segments().unwrap().next() { + if !workspace.is_empty() { + self.container_name = Some(workspace.to_string()) + } + } + self.use_fabric_endpoint = 
true.into(); + } + _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + }, + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + } + Ok(()) + } + + /// Set the Azure Account (required) + pub fn with_account(mut self, account: impl Into) -> Self { + self.account_name = Some(account.into()); + self + } + + /// Set the Azure Container Name (required) + pub fn with_container_name(mut self, container_name: impl Into) -> Self { + self.container_name = Some(container_name.into()); + self + } + + /// Set the Azure Access Key (required - one of access key, bearer token, or client credentials) + pub fn with_access_key(mut self, access_key: impl Into) -> Self { + self.access_key = Some(access_key.into()); + self + } + + /// Set a static bearer token to be used for authorizing requests + pub fn with_bearer_token_authorization( + mut self, + bearer_token: impl Into, + ) -> Self { + self.bearer_token = Some(bearer_token.into()); + self + } + + /// Set a client secret used for client secret authorization + pub fn with_client_secret_authorization( + mut self, + client_id: impl Into, + client_secret: impl Into, + tenant_id: impl Into, + ) -> Self { + self.client_id = Some(client_id.into()); + self.client_secret = Some(client_secret.into()); + self.tenant_id = Some(tenant_id.into()); + self + } + + /// Sets the client id for use in client secret or k8s federated credential flow + pub fn with_client_id(mut self, client_id: impl Into) -> Self { + self.client_id = Some(client_id.into()); + self + } + + /// Sets the client secret for use in client secret flow + pub fn with_client_secret(mut self, client_secret: impl Into) -> Self { + self.client_secret = Some(client_secret.into()); + self + } + + /// Sets the tenant id for use in client secret or k8s federated credential flow + pub fn with_tenant_id(mut self, tenant_id: impl Into) -> Self { + self.tenant_id = Some(tenant_id.into()); + self + } + + /// Set query pairs appended to the url for shared access signature authorization + pub fn with_sas_authorization( + mut self, + query_pairs: impl Into>, + ) -> Self { + self.sas_query_pairs = Some(query_pairs.into()); + self + } + + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: AzureCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + + /// Set if the Azure emulator should be used (defaults to false) + pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { + self.use_emulator = use_emulator.into(); + self + } + + /// Override the endpoint used to communicate with blob storage + /// + /// Defaults to `https://{account}.blob.core.windows.net` + pub fn with_endpoint(mut self, endpoint: String) -> Self { + self.endpoint = Some(endpoint); + self + } + + /// Set if Microsoft Fabric url scheme should be used (defaults to false) + /// + /// When disabled the url scheme used is `https://{account}.blob.core.windows.net` + /// When enabled the url scheme used is `https://{account}.dfs.fabric.microsoft.com` + /// + /// Note: [`Self::with_endpoint`] will take precedence over this option + pub fn with_use_fabric_endpoint(mut self, use_fabric_endpoint: bool) -> Self { + self.use_fabric_endpoint = use_fabric_endpoint.into(); + self + } + + /// Sets what protocol is allowed + /// + /// If `allow_http` is : + /// * false (default): Only HTTPS are allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.client_options = 
self.client_options.with_allow_http(allow_http); + self + } + + /// Sets an alternative authority host for OAuth based authorization + /// + /// Common hosts for azure clouds are defined in [authority_hosts](crate::azure::authority_hosts). + /// + /// Defaults to + pub fn with_authority_host(mut self, authority_host: impl Into) -> Self { + self.authority_host = Some(authority_host.into()); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; + self + } + + /// Sets the endpoint for acquiring managed identity token + pub fn with_msi_endpoint(mut self, msi_endpoint: impl Into) -> Self { + self.msi_endpoint = Some(msi_endpoint.into()); + self + } + + /// Sets a file path for acquiring azure federated identity token in k8s + /// + /// requires `client_id` and `tenant_id` to be set + pub fn with_federated_token_file( + mut self, + federated_token_file: impl Into, + ) -> Self { + self.federated_token_file = Some(federated_token_file.into()); + self + } + + /// Set if the Azure Cli should be used for acquiring access token + /// + /// + pub fn with_use_azure_cli(mut self, use_azure_cli: bool) -> Self { + self.use_azure_cli = use_azure_cli.into(); + self + } + + /// Configure a connection to container with given name on Microsoft Azure Blob store. + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; + } + + let container = self.container_name.ok_or(Error::MissingContainerName {})?; + + let static_creds = |credential: AzureCredential| -> AzureCredentialProvider { + Arc::new(StaticCredentialProvider::new(credential)) + }; + + let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? { + let account_name = self + .account_name + .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); + // Allow overriding defaults. Values taken from + // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 + let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; + let account_key = self + .access_key + .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); + + let credential = static_creds(AzureCredential::AccessKey(account_key)); + + self.client_options = self.client_options.with_allow_http(true); + (true, url, credential, account_name) + } else { + let account_name = self.account_name.ok_or(Error::MissingAccount {})?; + let account_url = match self.endpoint { + Some(account_url) => account_url, + None => match self.use_fabric_endpoint.get()? 
{ + true => { + format!("https://{}.blob.fabric.microsoft.com", &account_name) + } + false => format!("https://{}.blob.core.windows.net", &account_name), + }, + }; + + let url = Url::parse(&account_url) + .context(UnableToParseUrlSnafu { url: account_url })?; + + let credential = if let Some(credential) = self.credentials { + credential + } else if let Some(bearer_token) = self.bearer_token { + static_creds(AzureCredential::BearerToken(bearer_token)) + } else if let Some(access_key) = self.access_key { + static_creds(AzureCredential::AccessKey(access_key)) + } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = + (&self.client_id, &self.tenant_id, self.federated_token_file) + { + let client_credential = WorkloadIdentityOAuthProvider::new( + client_id, + federated_token_file, + tenant_id, + self.authority_host, + ); + Arc::new(TokenCredentialProvider::new( + client_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = + (&self.client_id, self.client_secret, &self.tenant_id) + { + let client_credential = ClientSecretOAuthProvider::new( + client_id.clone(), + client_secret, + tenant_id, + self.authority_host, + ); + Arc::new(TokenCredentialProvider::new( + client_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } else if let Some(query_pairs) = self.sas_query_pairs { + static_creds(AzureCredential::SASToken(query_pairs)) + } else if let Some(sas) = self.sas_key { + static_creds(AzureCredential::SASToken(split_sas(&sas)?)) + } else if self.use_azure_cli.get()? { + Arc::new(AzureCliCredential::new()) as _ + } else { + let msi_credential = ImdsManagedIdentityProvider::new( + self.client_id, + self.object_id, + self.msi_resource_id, + self.msi_endpoint, + ); + Arc::new(TokenCredentialProvider::new( + msi_credential, + self.client_options.metadata_client()?, + self.retry_config.clone(), + )) as _ + }; + (false, url, credential, account_name) + }; + + let config = AzureConfig { + account, + is_emulator, + container, + retry_config: self.retry_config, + client_options: self.client_options, + service: storage_url, + credentials: auth, + }; + + let client = Arc::new(AzureClient::new(config)?); + + Ok(MicrosoftAzure { client }) + } +} + +/// Parses the contents of the environment variable `env_name` as a URL +/// if present, otherwise falls back to default_url +fn url_from_env(env_name: &str, default_url: &str) -> Result { + let url = match std::env::var(env_name) { + Ok(env_value) => { + Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { + env_name, + env_value, + })? 
+ } + Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), + }; + Ok(url) +} + +fn split_sas(sas: &str) -> Result, Error> { + let sas = percent_decode_str(sas) + .decode_utf8() + .context(DecodeSasKeySnafu {})?; + let kv_str_pairs = sas + .trim_start_matches('?') + .split('&') + .filter(|s| !s.chars().all(char::is_whitespace)); + let mut pairs = Vec::new(); + for kv_pair_str in kv_str_pairs { + let (k, v) = kv_pair_str + .trim() + .split_once('=') + .ok_or(Error::MissingSasComponent {})?; + pairs.push((k.into(), v.into())) + } + Ok(pairs) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + #[test] + fn azure_blob_test_urls() { + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account.dfs.core.windows.net/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("abfs://container/path").unwrap(); + assert_eq!(builder.container_name, Some("container".to_string())); + + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("az://container").unwrap(); + assert_eq!(builder.container_name, Some("container".to_string())); + + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("az://container/path").unwrap(); + assert_eq!(builder.container_name, Some("container".to_string())); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.core.windows.net/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.core.windows.net/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.fabric.microsoft.com/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("container")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.fabric.microsoft.com/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + 
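+        // For fabric-style URLs, `parse_url` infers the container name from the
+        // first path segment (the workspace), hence the assertions that follow.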
assert_eq!(builder.container_name.as_deref(), Some("container")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let err_cases = [ + "mailto://account.blob.core.windows.net/", + "az://blob.mydomain/", + "abfs://container.foo/path", + "abfss://file_system@account.foo.dfs.core.windows.net/", + "abfss://file_system.bar@account.dfs.core.windows.net/", + "https://blob.mydomain/", + "https://blob.foo.dfs.core.windows.net/", + ]; + let mut builder = MicrosoftAzureBuilder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } + } + + #[test] + fn azure_test_config_from_map() { + let azure_client_id = "object_store:fake_access_key_id"; + let azure_storage_account_name = "object_store:fake_secret_key"; + let azure_storage_token = "object_store:fake_default_region"; + let options = HashMap::from([ + ("azure_client_id", azure_client_id), + ("azure_storage_account_name", azure_storage_account_name), + ("azure_storage_token", azure_storage_token), + ]); + + let builder = options + .into_iter() + .fold(MicrosoftAzureBuilder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); + assert_eq!(builder.client_id.unwrap(), azure_client_id); + assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); + assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); + } + + #[test] + fn azure_test_split_sas() { + let raw_sas = "?sv=2021-10-04&st=2023-01-04T17%3A48%3A57Z&se=2023-01-04T18%3A15%3A00Z&sr=c&sp=rcwl&sig=C7%2BZeEOWbrxPA3R0Cw%2Fw1EZz0%2B4KBvQexeKZKe%2BB6h0%3D"; + let expected = vec![ + ("sv".to_string(), "2021-10-04".to_string()), + ("st".to_string(), "2023-01-04T17:48:57Z".to_string()), + ("se".to_string(), "2023-01-04T18:15:00Z".to_string()), + ("sr".to_string(), "c".to_string()), + ("sp".to_string(), "rcwl".to_string()), + ( + "sig".to_string(), + "C7+ZeEOWbrxPA3R0Cw/w1EZz0+4KBvQexeKZKe+B6h0=".to_string(), + ), + ]; + let pairs = split_sas(raw_sas).unwrap(); + assert_eq!(expected, pairs); + } +} diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 0e638ef..7e1db5b 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -30,32 +30,24 @@ use self::client::{BlockId, BlockList}; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, PutResult, Result, RetryConfig, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, + Result, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; use futures::stream::BoxStream; -use percent_encoding::percent_decode_str; -use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; -use std::fmt::{Debug, Formatter}; -use std::str::FromStr; +use std::fmt::Debug; use std::sync::Arc; use tokio::io::AsyncWrite; -use url::Url; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; -use crate::client::{ - ClientConfigKey, CredentialProvider, StaticCredentialProvider, - TokenCredentialProvider, -}; -use crate::config::ConfigValue; +use crate::client::CredentialProvider; pub use credential::authority_hosts; +mod builder; mod client; mod credential; @@ -63,87 +55,11 @@ mod credential; pub type AzureCredentialProvider = Arc>; use crate::client::header::get_etag; +pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; const STORE: &str = "MicrosoftAzure"; -/// The well-known account used by Azurite and the legacy Azure 
Storage Emulator. -/// -const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; - -/// The well-known account key used by Azurite and the legacy Azure Storage Emulator. -/// -const EMULATOR_ACCOUNT_KEY: &str = - "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="; - -const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; - -/// A specialized `Error` for Azure object store-related errors -#[derive(Debug, Snafu)] -#[allow(missing_docs)] -enum Error { - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] - UnableToParseUrl { - source: url::ParseError, - url: String, - }, - - #[snafu(display( - "Unable parse emulator url {}={}, Error: {}", - env_name, - env_value, - source - ))] - UnableToParseEmulatorUrl { - env_name: String, - env_value: String, - source: url::ParseError, - }, - - #[snafu(display("Account must be specified"))] - MissingAccount {}, - - #[snafu(display("Container name must be specified"))] - MissingContainerName {}, - - #[snafu(display( - "Unknown url scheme cannot be parsed into storage location: {}", - scheme - ))] - UnknownUrlScheme { scheme: String }, - - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] - UrlNotRecognised { url: String }, - - #[snafu(display("Failed parsing an SAS key"))] - DecodeSasKey { source: std::str::Utf8Error }, - - #[snafu(display("Missing component in SAS query pair"))] - MissingSasComponent {}, - - #[snafu(display("Configuration key: '{}' is not known.", key))] - UnknownConfigurationKey { key: String }, - - #[snafu(display("Unable to extract metadata from headers: {}", source))] - Metadata { - source: crate::client::header::Error, - }, -} - -impl From for super::Error { - fn from(source: Error) -> Self { - match source { - Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: STORE, key } - } - _ => Self::Generic { - store: STORE, - source: Box::new(source), - }, - } - } -} - /// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). #[derive(Debug)] pub struct MicrosoftAzure { @@ -175,8 +91,11 @@ impl ObjectStore for MicrosoftAzure { .client .put_request(location, Some(bytes), false, &()) .await?; - let e_tag = Some(get_etag(response.headers()).context(MetadataSnafu)?); - Ok(PutResult { e_tag }) + let e_tag = get_etag(response.headers()).map_err(|e| crate::Error::Generic { + store: STORE, + source: Box::new(e), + })?; + Ok(PutResult { e_tag: Some(e_tag) }) } async fn put_multipart( @@ -279,853 +198,6 @@ impl PutPart for AzureMultiPartUpload { } } -/// Configure a connection to Microsoft Azure Blob Storage container using -/// the specified credentials. 
-/// -/// # Example -/// ``` -/// # let ACCOUNT = "foo"; -/// # let BUCKET_NAME = "foo"; -/// # let ACCESS_KEY = "foo"; -/// # use object_store::azure::MicrosoftAzureBuilder; -/// let azure = MicrosoftAzureBuilder::new() -/// .with_account(ACCOUNT) -/// .with_access_key(ACCESS_KEY) -/// .with_container_name(BUCKET_NAME) -/// .build(); -/// ``` -#[derive(Default, Clone)] -pub struct MicrosoftAzureBuilder { - /// Account name - account_name: Option, - /// Access key - access_key: Option, - /// Container name - container_name: Option, - /// Bearer token - bearer_token: Option, - /// Client id - client_id: Option, - /// Client secret - client_secret: Option, - /// Tenant id - tenant_id: Option, - /// Query pairs for shared access signature authorization - sas_query_pairs: Option>, - /// Shared access signature - sas_key: Option, - /// Authority host - authority_host: Option, - /// Url - url: Option, - /// When set to true, azurite storage emulator has to be used - use_emulator: ConfigValue, - /// Storage endpoint - endpoint: Option, - /// Msi endpoint for acquiring managed identity token - msi_endpoint: Option, - /// Object id for use with managed identity authentication - object_id: Option, - /// Msi resource id for use with managed identity authentication - msi_resource_id: Option, - /// File containing token for Azure AD workload identity federation - federated_token_file: Option, - /// When set to true, azure cli has to be used for acquiring access token - use_azure_cli: ConfigValue, - /// Retry config - retry_config: RetryConfig, - /// Client options - client_options: ClientOptions, - /// Credentials - credentials: Option, - /// When set to true, fabric url scheme will be used - /// - /// i.e. https://{account_name}.dfs.fabric.microsoft.com - use_fabric_endpoint: ConfigValue, -} - -/// Configuration keys for [`MicrosoftAzureBuilder`] -/// -/// Configuration via keys can be done via [`MicrosoftAzureBuilder::with_config`] -/// -/// # Example -/// ``` -/// # use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; -/// let builder = MicrosoftAzureBuilder::new() -/// .with_config("azure_client_id".parse().unwrap(), "my-client-id") -/// .with_config(AzureConfigKey::AuthorityId, "my-tenant-id"); -/// ``` -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] -#[non_exhaustive] -pub enum AzureConfigKey { - /// The name of the azure storage account - /// - /// Supported keys: - /// - `azure_storage_account_name` - /// - `account_name` - AccountName, - - /// Master key for accessing storage account - /// - /// Supported keys: - /// - `azure_storage_account_key` - /// - `azure_storage_access_key` - /// - `azure_storage_master_key` - /// - `access_key` - /// - `account_key` - /// - `master_key` - AccessKey, - - /// Service principal client id for authorizing requests - /// - /// Supported keys: - /// - `azure_storage_client_id` - /// - `azure_client_id` - /// - `client_id` - ClientId, - - /// Service principal client secret for authorizing requests - /// - /// Supported keys: - /// - `azure_storage_client_secret` - /// - `azure_client_secret` - /// - `client_secret` - ClientSecret, - - /// Tenant id used in oauth flows - /// - /// Supported keys: - /// - `azure_storage_tenant_id` - /// - `azure_storage_authority_id` - /// - `azure_tenant_id` - /// - `azure_authority_id` - /// - `tenant_id` - /// - `authority_id` - AuthorityId, - - /// Shared access signature. 
- /// - /// The signature is expected to be percent-encoded, much like they are provided - /// in the azure storage explorer or azure portal. - /// - /// Supported keys: - /// - `azure_storage_sas_key` - /// - `azure_storage_sas_token` - /// - `sas_key` - /// - `sas_token` - SasKey, - - /// Bearer token - /// - /// Supported keys: - /// - `azure_storage_token` - /// - `bearer_token` - /// - `token` - Token, - - /// Use object store with azurite storage emulator - /// - /// Supported keys: - /// - `azure_storage_use_emulator` - /// - `object_store_use_emulator` - /// - `use_emulator` - UseEmulator, - - /// Override the endpoint used to communicate with blob storage - /// - /// Supported keys: - /// - `azure_storage_endpoint` - /// - `azure_endpoint` - /// - `endpoint` - Endpoint, - - /// Use object store with url scheme account.dfs.fabric.microsoft.com - /// - /// Supported keys: - /// - `azure_use_fabric_endpoint` - /// - `use_fabric_endpoint` - UseFabricEndpoint, - - /// Endpoint to request a imds managed identity token - /// - /// Supported keys: - /// - `azure_msi_endpoint` - /// - `azure_identity_endpoint` - /// - `identity_endpoint` - /// - `msi_endpoint` - MsiEndpoint, - - /// Object id for use with managed identity authentication - /// - /// Supported keys: - /// - `azure_object_id` - /// - `object_id` - ObjectId, - - /// Msi resource id for use with managed identity authentication - /// - /// Supported keys: - /// - `azure_msi_resource_id` - /// - `msi_resource_id` - MsiResourceId, - - /// File containing token for Azure AD workload identity federation - /// - /// Supported keys: - /// - `azure_federated_token_file` - /// - `federated_token_file` - FederatedTokenFile, - - /// Use azure cli for acquiring access token - /// - /// Supported keys: - /// - `azure_use_azure_cli` - /// - `use_azure_cli` - UseAzureCli, - - /// Container name - /// - /// Supported keys: - /// - `azure_container_name` - /// - `container_name` - ContainerName, - - /// Client options - Client(ClientConfigKey), -} - -impl AsRef for AzureConfigKey { - fn as_ref(&self) -> &str { - match self { - Self::AccountName => "azure_storage_account_name", - Self::AccessKey => "azure_storage_account_key", - Self::ClientId => "azure_storage_client_id", - Self::ClientSecret => "azure_storage_client_secret", - Self::AuthorityId => "azure_storage_tenant_id", - Self::SasKey => "azure_storage_sas_key", - Self::Token => "azure_storage_token", - Self::UseEmulator => "azure_storage_use_emulator", - Self::UseFabricEndpoint => "azure_use_fabric_endpoint", - Self::Endpoint => "azure_storage_endpoint", - Self::MsiEndpoint => "azure_msi_endpoint", - Self::ObjectId => "azure_object_id", - Self::MsiResourceId => "azure_msi_resource_id", - Self::FederatedTokenFile => "azure_federated_token_file", - Self::UseAzureCli => "azure_use_azure_cli", - Self::ContainerName => "azure_container_name", - Self::Client(key) => key.as_ref(), - } - } -} - -impl FromStr for AzureConfigKey { - type Err = super::Error; - - fn from_str(s: &str) -> Result { - match s { - "azure_storage_account_key" - | "azure_storage_access_key" - | "azure_storage_master_key" - | "master_key" - | "account_key" - | "access_key" => Ok(Self::AccessKey), - "azure_storage_account_name" | "account_name" => Ok(Self::AccountName), - "azure_storage_client_id" | "azure_client_id" | "client_id" => { - Ok(Self::ClientId) - } - "azure_storage_client_secret" | "azure_client_secret" | "client_secret" => { - Ok(Self::ClientSecret) - } - "azure_storage_tenant_id" - | "azure_storage_authority_id" 
- | "azure_tenant_id" - | "azure_authority_id" - | "tenant_id" - | "authority_id" => Ok(Self::AuthorityId), - "azure_storage_sas_key" - | "azure_storage_sas_token" - | "sas_key" - | "sas_token" => Ok(Self::SasKey), - "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), - "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), - "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => { - Ok(Self::Endpoint) - } - "azure_msi_endpoint" - | "azure_identity_endpoint" - | "identity_endpoint" - | "msi_endpoint" => Ok(Self::MsiEndpoint), - "azure_object_id" | "object_id" => Ok(Self::ObjectId), - "azure_msi_resource_id" | "msi_resource_id" => Ok(Self::MsiResourceId), - "azure_federated_token_file" | "federated_token_file" => { - Ok(Self::FederatedTokenFile) - } - "azure_use_fabric_endpoint" | "use_fabric_endpoint" => { - Ok(Self::UseFabricEndpoint) - } - "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), - "azure_container_name" | "container_name" => Ok(Self::ContainerName), - // Backwards compatibility - "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), - _ => match s.parse() { - Ok(key) => Ok(Self::Client(key)), - Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), - }, - } - } -} - -impl Debug for MicrosoftAzureBuilder { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!( - f, - "MicrosoftAzureBuilder {{ account: {:?}, container_name: {:?} }}", - self.account_name, self.container_name - ) - } -} - -impl MicrosoftAzureBuilder { - /// Create a new [`MicrosoftAzureBuilder`] with default values. - pub fn new() -> Self { - Default::default() - } - - /// Create an instance of [`MicrosoftAzureBuilder`] with values pre-populated from environment variables. - /// - /// Variables extracted from environment: - /// * AZURE_STORAGE_ACCOUNT_NAME: storage account name - /// * AZURE_STORAGE_ACCOUNT_KEY: storage account master key - /// * AZURE_STORAGE_ACCESS_KEY: alias for AZURE_STORAGE_ACCOUNT_KEY - /// * AZURE_STORAGE_CLIENT_ID -> client id for service principal authorization - /// * AZURE_STORAGE_CLIENT_SECRET -> client secret for service principal authorization - /// * AZURE_STORAGE_TENANT_ID -> tenant id used in oauth flows - /// # Example - /// ``` - /// use object_store::azure::MicrosoftAzureBuilder; - /// - /// let azure = MicrosoftAzureBuilder::from_env() - /// .with_container_name("foo") - /// .build(); - /// ``` - pub fn from_env() -> Self { - let mut builder = Self::default(); - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if key.starts_with("AZURE_") { - if let Ok(config_key) = key.to_ascii_lowercase().parse() { - builder = builder.with_config(config_key, value); - } - } - } - } - - if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { - builder = builder.with_msi_endpoint(text); - } - - builder - } - - /// Parse available connection info form a well-known storage URL. 
- /// - /// The supported url schemes are: - /// - /// - `abfs[s]:///` (according to [fsspec](https://github.com/fsspec/adlfs)) - /// - `abfs[s]://@.dfs.core.windows.net/` - /// - `abfs[s]://@.dfs.fabric.microsoft.com/` - /// - `az:///` (according to [fsspec](https://github.com/fsspec/adlfs)) - /// - `adl:///` (according to [fsspec](https://github.com/fsspec/adlfs)) - /// - `azure:///` (custom) - /// - `https://.dfs.core.windows.net` - /// - `https://.blob.core.windows.net` - /// - `https://.dfs.fabric.microsoft.com` - /// - `https://.dfs.fabric.microsoft.com/` - /// - `https://.blob.fabric.microsoft.com` - /// - `https://.blob.fabric.microsoft.com/` - /// - /// Note: Settings derived from the URL will override any others set on this builder - /// - /// # Example - /// ``` - /// use object_store::azure::MicrosoftAzureBuilder; - /// - /// let azure = MicrosoftAzureBuilder::from_env() - /// .with_url("abfss://file_system@account.dfs.core.windows.net/") - /// .build(); - /// ``` - pub fn with_url(mut self, url: impl Into) -> Self { - self.url = Some(url.into()); - self - } - - /// Set an option on the builder via a key - value pair. - pub fn with_config(mut self, key: AzureConfigKey, value: impl Into) -> Self { - match key { - AzureConfigKey::AccessKey => self.access_key = Some(value.into()), - AzureConfigKey::AccountName => self.account_name = Some(value.into()), - AzureConfigKey::ClientId => self.client_id = Some(value.into()), - AzureConfigKey::ClientSecret => self.client_secret = Some(value.into()), - AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), - AzureConfigKey::SasKey => self.sas_key = Some(value.into()), - AzureConfigKey::Token => self.bearer_token = Some(value.into()), - AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), - AzureConfigKey::ObjectId => self.object_id = Some(value.into()), - AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), - AzureConfigKey::FederatedTokenFile => { - self.federated_token_file = Some(value.into()) - } - AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), - AzureConfigKey::UseEmulator => self.use_emulator.parse(value), - AzureConfigKey::Endpoint => self.endpoint = Some(value.into()), - AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), - AzureConfigKey::Client(key) => { - self.client_options = self.client_options.with_config(key, value) - } - AzureConfigKey::ContainerName => self.container_name = Some(value.into()), - }; - self - } - - /// Set an option on the builder via a key - value pair. - #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - - /// Get config value via a [`AzureConfigKey`]. 
- /// - /// # Example - /// ``` - /// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; - /// - /// let builder = MicrosoftAzureBuilder::from_env() - /// .with_account("foo"); - /// let account_name = builder.get_config_value(&AzureConfigKey::AccountName).unwrap_or_default(); - /// assert_eq!("foo", &account_name); - /// ``` - pub fn get_config_value(&self, key: &AzureConfigKey) -> Option { - match key { - AzureConfigKey::AccountName => self.account_name.clone(), - AzureConfigKey::AccessKey => self.access_key.clone(), - AzureConfigKey::ClientId => self.client_id.clone(), - AzureConfigKey::ClientSecret => self.client_secret.clone(), - AzureConfigKey::AuthorityId => self.tenant_id.clone(), - AzureConfigKey::SasKey => self.sas_key.clone(), - AzureConfigKey::Token => self.bearer_token.clone(), - AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), - AzureConfigKey::UseFabricEndpoint => { - Some(self.use_fabric_endpoint.to_string()) - } - AzureConfigKey::Endpoint => self.endpoint.clone(), - AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), - AzureConfigKey::ObjectId => self.object_id.clone(), - AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), - AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), - AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), - AzureConfigKey::Client(key) => self.client_options.get_config_value(key), - AzureConfigKey::ContainerName => self.container_name.clone(), - } - } - - /// Sets properties on this builder based on a URL - /// - /// This is a separate member function to allow fallible computation to - /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] - fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - - let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), - false => Ok(s.to_string()), - }; - - match parsed.scheme() { - "az" | "adl" | "azure" => self.container_name = Some(validate(host)?), - "abfs" | "abfss" => { - // abfs(s) might refer to the fsspec convention abfs:/// - // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ - if parsed.username().is_empty() { - self.container_name = Some(validate(host)?); - } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") { - self.container_name = Some(validate(parsed.username())?); - self.account_name = Some(validate(a)?); - } else if let Some(a) = host.strip_suffix(".dfs.fabric.microsoft.com") { - self.container_name = Some(validate(parsed.username())?); - self.account_name = Some(validate(a)?); - self.use_fabric_endpoint = true.into(); - } else { - return Err(UrlNotRecognisedSnafu { url }.build().into()); - } - } - "https" => match host.split_once('.') { - Some((a, "dfs.core.windows.net")) - | Some((a, "blob.core.windows.net")) => { - self.account_name = Some(validate(a)?); - } - Some((a, "dfs.fabric.microsoft.com")) - | Some((a, "blob.fabric.microsoft.com")) => { - self.account_name = Some(validate(a)?); - // Attempt to infer the container name from the URL - // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv - // - https://onelake.dfs.fabric.microsoft.com//.// - // - // See - if let Some(workspace) = parsed.path_segments().unwrap().next() { - if !workspace.is_empty() { - self.container_name = Some(workspace.to_string()) - } - } - self.use_fabric_endpoint = 
true.into(); - } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), - }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), - } - Ok(()) - } - - /// Set the Azure Account (required) - pub fn with_account(mut self, account: impl Into) -> Self { - self.account_name = Some(account.into()); - self - } - - /// Set the Azure Container Name (required) - pub fn with_container_name(mut self, container_name: impl Into) -> Self { - self.container_name = Some(container_name.into()); - self - } - - /// Set the Azure Access Key (required - one of access key, bearer token, or client credentials) - pub fn with_access_key(mut self, access_key: impl Into) -> Self { - self.access_key = Some(access_key.into()); - self - } - - /// Set a static bearer token to be used for authorizing requests - pub fn with_bearer_token_authorization( - mut self, - bearer_token: impl Into, - ) -> Self { - self.bearer_token = Some(bearer_token.into()); - self - } - - /// Set a client secret used for client secret authorization - pub fn with_client_secret_authorization( - mut self, - client_id: impl Into, - client_secret: impl Into, - tenant_id: impl Into, - ) -> Self { - self.client_id = Some(client_id.into()); - self.client_secret = Some(client_secret.into()); - self.tenant_id = Some(tenant_id.into()); - self - } - - /// Sets the client id for use in client secret or k8s federated credential flow - pub fn with_client_id(mut self, client_id: impl Into) -> Self { - self.client_id = Some(client_id.into()); - self - } - - /// Sets the client secret for use in client secret flow - pub fn with_client_secret(mut self, client_secret: impl Into) -> Self { - self.client_secret = Some(client_secret.into()); - self - } - - /// Sets the tenant id for use in client secret or k8s federated credential flow - pub fn with_tenant_id(mut self, tenant_id: impl Into) -> Self { - self.tenant_id = Some(tenant_id.into()); - self - } - - /// Set query pairs appended to the url for shared access signature authorization - pub fn with_sas_authorization( - mut self, - query_pairs: impl Into>, - ) -> Self { - self.sas_query_pairs = Some(query_pairs.into()); - self - } - - /// Set the credential provider overriding any other options - pub fn with_credentials(mut self, credentials: AzureCredentialProvider) -> Self { - self.credentials = Some(credentials); - self - } - - /// Set if the Azure emulator should be used (defaults to false) - pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { - self.use_emulator = use_emulator.into(); - self - } - - /// Override the endpoint used to communicate with blob storage - /// - /// Defaults to `https://{account}.blob.core.windows.net` - pub fn with_endpoint(mut self, endpoint: String) -> Self { - self.endpoint = Some(endpoint); - self - } - - /// Set if Microsoft Fabric url scheme should be used (defaults to false) - /// When disabled the url scheme used is `https://{account}.blob.core.windows.net` - /// When enabled the url scheme used is `https://{account}.dfs.fabric.microsoft.com` - /// - /// Note: [`Self::with_endpoint`] will take precedence over this option - pub fn with_use_fabric_endpoint(mut self, use_fabric_endpoint: bool) -> Self { - self.use_fabric_endpoint = use_fabric_endpoint.into(); - self - } - - /// Sets what protocol is allowed. 
If `allow_http` is : - /// * false (default): Only HTTPS are allowed - /// * true: HTTP and HTTPS are allowed - pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.client_options = self.client_options.with_allow_http(allow_http); - self - } - - /// Sets an alternative authority host for OAuth based authorization - /// common hosts for azure clouds are defined in [authority_hosts]. - /// Defaults to - pub fn with_authority_host(mut self, authority_host: impl Into) -> Self { - self.authority_host = Some(authority_host.into()); - self - } - - /// Set the retry configuration - pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { - self.retry_config = retry_config; - self - } - - /// Set the proxy_url to be used by the underlying client - pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_url(proxy_url); - self - } - - /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { - self.client_options = self - .client_options - .with_proxy_ca_certificate(proxy_ca_certificate); - self - } - - /// Set a list of hosts to exclude from proxy connections - pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); - self - } - - /// Sets the client options, overriding any already set - pub fn with_client_options(mut self, options: ClientOptions) -> Self { - self.client_options = options; - self - } - - /// Sets the endpoint for acquiring managed identity token - pub fn with_msi_endpoint(mut self, msi_endpoint: impl Into) -> Self { - self.msi_endpoint = Some(msi_endpoint.into()); - self - } - - /// Sets a file path for acquiring azure federated identity token in k8s - /// - /// requires `client_id` and `tenant_id` to be set - pub fn with_federated_token_file( - mut self, - federated_token_file: impl Into, - ) -> Self { - self.federated_token_file = Some(federated_token_file.into()); - self - } - - /// Set if the Azure Cli should be used for acquiring access token - /// - pub fn with_use_azure_cli(mut self, use_azure_cli: bool) -> Self { - self.use_azure_cli = use_azure_cli.into(); - self - } - - /// Configure a connection to container with given name on Microsoft Azure - /// Blob store. - pub fn build(mut self) -> Result { - if let Some(url) = self.url.take() { - self.parse_url(&url)?; - } - - let container = self.container_name.ok_or(Error::MissingContainerName {})?; - - let static_creds = |credential: AzureCredential| -> AzureCredentialProvider { - Arc::new(StaticCredentialProvider::new(credential)) - }; - - let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? { - let account_name = self - .account_name - .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); - // Allow overriding defaults. 
Values taken from - // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 - let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let account_key = self - .access_key - .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); - - let credential = static_creds(AzureCredential::AccessKey(account_key)); - - self.client_options = self.client_options.with_allow_http(true); - (true, url, credential, account_name) - } else { - let account_name = self.account_name.ok_or(Error::MissingAccount {})?; - let account_url = match self.endpoint { - Some(account_url) => account_url, - None => match self.use_fabric_endpoint.get()? { - true => { - format!("https://{}.blob.fabric.microsoft.com", &account_name) - } - false => format!("https://{}.blob.core.windows.net", &account_name), - }, - }; - - let url = Url::parse(&account_url) - .context(UnableToParseUrlSnafu { url: account_url })?; - - let credential = if let Some(credential) = self.credentials { - credential - } else if let Some(bearer_token) = self.bearer_token { - static_creds(AzureCredential::BearerToken(bearer_token)) - } else if let Some(access_key) = self.access_key { - static_creds(AzureCredential::AccessKey(access_key)) - } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = - (&self.client_id, &self.tenant_id, self.federated_token_file) - { - let client_credential = credential::WorkloadIdentityOAuthProvider::new( - client_id, - federated_token_file, - tenant_id, - self.authority_host, - ); - Arc::new(TokenCredentialProvider::new( - client_credential, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = - (&self.client_id, self.client_secret, &self.tenant_id) - { - let client_credential = credential::ClientSecretOAuthProvider::new( - client_id.clone(), - client_secret, - tenant_id, - self.authority_host, - ); - Arc::new(TokenCredentialProvider::new( - client_credential, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } else if let Some(query_pairs) = self.sas_query_pairs { - static_creds(AzureCredential::SASToken(query_pairs)) - } else if let Some(sas) = self.sas_key { - static_creds(AzureCredential::SASToken(split_sas(&sas)?)) - } else if self.use_azure_cli.get()? { - Arc::new(credential::AzureCliCredential::new()) as _ - } else { - let msi_credential = credential::ImdsManagedIdentityProvider::new( - self.client_id, - self.object_id, - self.msi_resource_id, - self.msi_endpoint, - ); - Arc::new(TokenCredentialProvider::new( - msi_credential, - self.client_options.metadata_client()?, - self.retry_config.clone(), - )) as _ - }; - (false, url, credential, account_name) - }; - - let config = client::AzureConfig { - account, - is_emulator, - container, - retry_config: self.retry_config, - client_options: self.client_options, - service: storage_url, - credentials: auth, - }; - - let client = Arc::new(client::AzureClient::new(config)?); - - Ok(MicrosoftAzure { client }) - } -} - -/// Parses the contents of the environment variable `env_name` as a URL -/// if present, otherwise falls back to default_url -fn url_from_env(env_name: &str, default_url: &str) -> Result { - let url = match std::env::var(env_name) { - Ok(env_value) => { - Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { - env_name, - env_value, - })? 
- } - Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), - }; - Ok(url) -} - -fn split_sas(sas: &str) -> Result, Error> { - let sas = percent_decode_str(sas) - .decode_utf8() - .context(DecodeSasKeySnafu {})?; - let kv_str_pairs = sas - .trim_start_matches('?') - .split('&') - .filter(|s| !s.chars().all(char::is_whitespace)); - let mut pairs = Vec::new(); - for kv_pair_str in kv_str_pairs { - let (k, v) = kv_pair_str - .trim() - .split_once('=') - .ok_or(Error::MissingSasComponent {})?; - pairs.push((k.into(), v.into())) - } - Ok(pairs) -} - #[cfg(test)] mod tests { use super::*; @@ -1133,7 +205,6 @@ mod tests { copy_if_not_exists, get_opts, list_uses_directories_correctly, list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, }; - use std::collections::HashMap; #[tokio::test] async fn azure_blob_test() { @@ -1149,118 +220,6 @@ mod tests { stream_get(&integration).await; } - #[test] - fn azure_blob_test_urls() { - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("abfss://file_system@account.dfs.core.windows.net/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, Some("file_system".to_string())); - assert!(!builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("abfss://file_system@account.dfs.fabric.microsoft.com/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, Some("file_system".to_string())); - assert!(builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder.parse_url("abfs://container/path").unwrap(); - assert_eq!(builder.container_name, Some("container".to_string())); - - let mut builder = MicrosoftAzureBuilder::new(); - builder.parse_url("az://container").unwrap(); - assert_eq!(builder.container_name, Some("container".to_string())); - - let mut builder = MicrosoftAzureBuilder::new(); - builder.parse_url("az://container/path").unwrap(); - assert_eq!(builder.container_name, Some("container".to_string())); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.dfs.core.windows.net/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert!(!builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.blob.core.windows.net/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert!(!builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.dfs.fabric.microsoft.com/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, None); - assert!(builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.dfs.fabric.microsoft.com/container") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name.as_deref(), Some("container")); - assert!(builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.blob.fabric.microsoft.com/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, None); - 
assert!(builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.blob.fabric.microsoft.com/container") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name.as_deref(), Some("container")); - assert!(builder.use_fabric_endpoint.get().unwrap()); - - let err_cases = [ - "mailto://account.blob.core.windows.net/", - "az://blob.mydomain/", - "abfs://container.foo/path", - "abfss://file_system@account.foo.dfs.core.windows.net/", - "abfss://file_system.bar@account.dfs.core.windows.net/", - "https://blob.mydomain/", - "https://blob.foo.dfs.core.windows.net/", - ]; - let mut builder = MicrosoftAzureBuilder::new(); - for case in err_cases { - builder.parse_url(case).unwrap_err(); - } - } - - #[test] - fn azure_test_config_from_map() { - let azure_client_id = "object_store:fake_access_key_id"; - let azure_storage_account_name = "object_store:fake_secret_key"; - let azure_storage_token = "object_store:fake_default_region"; - let options = HashMap::from([ - ("azure_client_id", azure_client_id), - ("azure_storage_account_name", azure_storage_account_name), - ("azure_storage_token", azure_storage_token), - ]); - - let builder = options - .into_iter() - .fold(MicrosoftAzureBuilder::new(), |builder, (key, value)| { - builder.with_config(key.parse().unwrap(), value) - }); - assert_eq!(builder.client_id.unwrap(), azure_client_id); - assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); - assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); - } - #[test] fn azure_test_config_get_value() { let azure_client_id = "object_store:fake_access_key_id".to_string(); @@ -1286,22 +245,4 @@ mod tests { azure_storage_token ); } - - #[test] - fn azure_test_split_sas() { - let raw_sas = "?sv=2021-10-04&st=2023-01-04T17%3A48%3A57Z&se=2023-01-04T18%3A15%3A00Z&sr=c&sp=rcwl&sig=C7%2BZeEOWbrxPA3R0Cw%2Fw1EZz0%2B4KBvQexeKZKe%2BB6h0%3D"; - let expected = vec![ - ("sv".to_string(), "2021-10-04".to_string()), - ("st".to_string(), "2023-01-04T17:48:57Z".to_string()), - ("se".to_string(), "2023-01-04T18:15:00Z".to_string()), - ("sr".to_string(), "c".to_string()), - ("sp".to_string(), "rcwl".to_string()), - ( - "sig".to_string(), - "C7+ZeEOWbrxPA3R0Cw/w1EZz0+4KBvQexeKZKe+B6h0=".to_string(), - ), - ]; - let pairs = split_sas(raw_sas).unwrap(); - assert_eq!(expected, pairs); - } } From f5c696a72a435329f34f7b79c0dff4003ee718bc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:41:17 +0100 Subject: [PATCH 211/397] Add module links in docs root (#4955) --- src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 018f0f5..8631361 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,19 +64,19 @@ //! #![cfg_attr( feature = "gcp", - doc = "* `gcp`: [Google Cloud Storage](https://cloud.google.com/storage/) support. See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" + doc = "* [`gcp`]: [Google Cloud Storage](https://cloud.google.com/storage/) support. See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" )] #![cfg_attr( feature = "aws", - doc = "* `aws`: [Amazon S3](https://aws.amazon.com/s3/). See [`AmazonS3Builder`](aws::AmazonS3Builder)" + doc = "* [`aws`]: [Amazon S3](https://aws.amazon.com/s3/). 
See [`AmazonS3Builder`](aws::AmazonS3Builder)" )] #![cfg_attr( feature = "azure", - doc = "* `azure`: [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/). See [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" + doc = "* [`azure`]: [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/). See [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" )] #![cfg_attr( feature = "http", - doc = "* `http`: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" + doc = "* [`http`]: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] //! //! # Adapters From 82bbdbe692e837d90fd23920f6b25a47385c95b6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:45:43 +0100 Subject: [PATCH 212/397] Split gcp Module (#4956) * Split out GCP client * Split out builder * RAT --- src/gcp/builder.rs | 705 ++++++++++++++++++++++++++++ src/gcp/client.rs | 446 ++++++++++++++++++ src/gcp/mod.rs | 1097 ++------------------------------------------ 3 files changed, 1177 insertions(+), 1071 deletions(-) create mode 100644 src/gcp/builder.rs create mode 100644 src/gcp/client.rs diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs new file mode 100644 index 0000000..920ab8b --- /dev/null +++ b/src/gcp/builder.rs @@ -0,0 +1,705 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::TokenCredentialProvider; +use crate::gcp::client::{GoogleCloudStorageClient, GoogleCloudStorageConfig}; +use crate::gcp::credential::{ + ApplicationDefaultCredentials, InstanceCredentialProvider, ServiceAccountCredentials, + DEFAULT_GCS_BASE_URL, +}; +use crate::gcp::{ + credential, GcpCredential, GcpCredentialProvider, GoogleCloudStorage, STORE, +}; +use crate::{ + ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, +}; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::str::FromStr; +use std::sync::Arc; +use url::Url; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Missing bucket name"))] + MissingBucketName {}, + + #[snafu(display( + "One of service account path or service account key may be provided." + ))] + ServiceAccountPathAndKeyProvided, + + #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, + + #[snafu(display("GCP credential error: {}", source))] + Credential { source: credential::Error }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + match err { + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: STORE, key } + } + _ => Self::Generic { + store: STORE, + source: Box::new(err), + }, + } + } +} + +/// Configure a connection to Google Cloud Storage using the specified +/// credentials. +/// +/// # Example +/// ``` +/// # let BUCKET_NAME = "foo"; +/// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json"; +/// # use object_store::gcp::GoogleCloudStorageBuilder; +/// let gcs = GoogleCloudStorageBuilder::new() +/// .with_service_account_path(SERVICE_ACCOUNT_PATH) +/// .with_bucket_name(BUCKET_NAME) +/// .build(); +/// ``` +#[derive(Debug, Clone)] +pub struct GoogleCloudStorageBuilder { + /// Bucket name + bucket_name: Option, + /// Url + url: Option, + /// Path to the service account file + service_account_path: Option, + /// The serialized service account key + service_account_key: Option, + /// Path to the application credentials file. + application_credentials_path: Option, + /// Retry config + retry_config: RetryConfig, + /// Client options + client_options: ClientOptions, + /// Credentials + credentials: Option, +} + +/// Configuration keys for [`GoogleCloudStorageBuilder`] +/// +/// Configuration via keys can be done via [`GoogleCloudStorageBuilder::with_config`] +/// +/// # Example +/// ``` +/// # use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; +/// let builder = GoogleCloudStorageBuilder::new() +/// .with_config("google_service_account".parse().unwrap(), "my-service-account") +/// .with_config(GoogleConfigKey::Bucket, "my-bucket"); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] +pub enum GoogleConfigKey { + /// Path to the service account file + /// + /// Supported keys: + /// - `google_service_account` + /// - `service_account` + /// - `google_service_account_path` + /// - `service_account_path` + ServiceAccount, + + /// The serialized service account key. + /// + /// Supported keys: + /// - `google_service_account_key` + /// - `service_account_key` + ServiceAccountKey, + + /// Bucket name + /// + /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details. + /// + /// Supported keys: + /// - `google_bucket` + /// - `google_bucket_name` + /// - `bucket` + /// - `bucket_name` + Bucket, + + /// Application credentials path + /// + /// See [`GoogleCloudStorageBuilder::with_application_credentials`]. 
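+    ///
+    /// Supported keys:
+    /// - `google_application_credentials`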
+ ApplicationCredentials, + + /// Client options + Client(ClientConfigKey), +} + +impl AsRef for GoogleConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::ServiceAccount => "google_service_account", + Self::ServiceAccountKey => "google_service_account_key", + Self::Bucket => "google_bucket", + Self::ApplicationCredentials => "google_application_credentials", + Self::Client(key) => key.as_ref(), + } + } +} + +impl FromStr for GoogleConfigKey { + type Err = crate::Error; + + fn from_str(s: &str) -> Result { + match s { + "google_service_account" + | "service_account" + | "google_service_account_path" + | "service_account_path" => Ok(Self::ServiceAccount), + "google_service_account_key" | "service_account_key" => { + Ok(Self::ServiceAccountKey) + } + "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { + Ok(Self::Bucket) + } + "google_application_credentials" => Ok(Self::ApplicationCredentials), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, + } + } +} + +impl Default for GoogleCloudStorageBuilder { + fn default() -> Self { + Self { + bucket_name: None, + service_account_path: None, + service_account_key: None, + application_credentials_path: None, + retry_config: Default::default(), + client_options: ClientOptions::new().with_allow_http(true), + url: None, + credentials: None, + } + } +} + +impl GoogleCloudStorageBuilder { + /// Create a new [`GoogleCloudStorageBuilder`] with default values. + pub fn new() -> Self { + Default::default() + } + + /// Create an instance of [`GoogleCloudStorageBuilder`] with values pre-populated from environment variables. + /// + /// Variables extracted from environment: + /// * GOOGLE_SERVICE_ACCOUNT: location of service account file + /// * GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file + /// * SERVICE_ACCOUNT: (alias) location of service account file + /// * GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key + /// * GOOGLE_BUCKET: bucket name + /// * GOOGLE_BUCKET_NAME: (alias) bucket name + /// + /// # Example + /// ``` + /// use object_store::gcp::GoogleCloudStorageBuilder; + /// + /// let gcs = GoogleCloudStorageBuilder::from_env() + /// .with_bucket_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder = Self::default(); + + if let Ok(service_account_path) = std::env::var("SERVICE_ACCOUNT") { + builder.service_account_path = Some(service_account_path); + } + + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("GOOGLE_") { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + builder = builder.with_config(config_key, value); + } + } + } + } + + builder + } + + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `gs:///` + /// + /// Note: Settings derived from the URL will override any others set on this builder + /// + /// # Example + /// ``` + /// use object_store::gcp::GoogleCloudStorageBuilder; + /// + /// let gcs = GoogleCloudStorageBuilder::from_env() + /// .with_url("gs://bucket/path") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Set an option on the builder via a key - value pair. 
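+    ///
+    /// # Example
+    /// ```
+    /// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey};
+    ///
+    /// let builder = GoogleCloudStorageBuilder::new()
+    ///     .with_config(GoogleConfigKey::Bucket, "my-bucket");
+    /// ```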
+ pub fn with_config(mut self, key: GoogleConfigKey, value: impl Into) -> Self { + match key { + GoogleConfigKey::ServiceAccount => { + self.service_account_path = Some(value.into()) + } + GoogleConfigKey::ServiceAccountKey => { + self.service_account_key = Some(value.into()) + } + GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), + GoogleConfigKey::ApplicationCredentials => { + self.application_credentials_path = Some(value.into()) + } + GoogleConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } + }; + self + } + + /// Set an option on the builder via a key - value pair. + #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) + } + + /// Hydrate builder from key value pairs + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + + /// Get config value via a [`GoogleConfigKey`]. + /// + /// # Example + /// ``` + /// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; + /// + /// let builder = GoogleCloudStorageBuilder::from_env() + /// .with_service_account_key("foo"); + /// let service_account_key = builder.get_config_value(&GoogleConfigKey::ServiceAccountKey).unwrap_or_default(); + /// assert_eq!("foo", &service_account_key); + /// ``` + pub fn get_config_value(&self, key: &GoogleConfigKey) -> Option { + match key { + GoogleConfigKey::ServiceAccount => self.service_account_path.clone(), + GoogleConfigKey::ServiceAccountKey => self.service_account_key.clone(), + GoogleConfigKey::Bucket => self.bucket_name.clone(), + GoogleConfigKey::ApplicationCredentials => { + self.application_credentials_path.clone() + } + GoogleConfigKey::Client(key) => self.client_options.get_config_value(key), + } + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "gs" => self.bucket_name = Some(validate(host)?), + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + } + Ok(()) + } + + /// Set the bucket name (required) + pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { + self.bucket_name = Some(bucket_name.into()); + self + } + + /// Set the path to the service account file. + /// + /// This or [`GoogleCloudStorageBuilder::with_service_account_key`] must be + /// set. + /// + /// Example `"/tmp/gcs.json"`. + /// + /// Example contents of `gcs.json`: + /// + /// ```json + /// { + /// "gcs_base_url": "https://localhost:4443", + /// "disable_oauth": true, + /// "client_email": "", + /// "private_key": "" + /// } + /// ``` + pub fn with_service_account_path( + mut self, + service_account_path: impl Into, + ) -> Self { + self.service_account_path = Some(service_account_path.into()); + self + } + + /// Set the service account key. 
The service account must be in the JSON + /// format. + /// + /// This or [`GoogleCloudStorageBuilder::with_service_account_path`] must be + /// set. + pub fn with_service_account_key( + mut self, + service_account: impl Into, + ) -> Self { + self.service_account_key = Some(service_account.into()); + self + } + + /// Set the path to the application credentials file. + /// + /// + pub fn with_application_credentials( + mut self, + application_credentials_path: impl Into, + ) -> Self { + self.application_credentials_path = Some(application_credentials_path.into()); + self + } + + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: GcpCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; + self + } + + /// Configure a connection to Google Cloud Storage, returning a + /// new [`GoogleCloudStorage`] and consuming `self` + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; + } + + let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; + + // First try to initialize from the service account information. + let service_account_credentials = + match (self.service_account_path, self.service_account_key) { + (Some(path), None) => Some( + ServiceAccountCredentials::from_file(path) + .context(CredentialSnafu)?, + ), + (None, Some(key)) => Some( + ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?, + ), + (None, None) => None, + (Some(_), Some(_)) => { + return Err(Error::ServiceAccountPathAndKeyProvided.into()) + } + }; + + // Then try to initialize from the application credentials file, or the environment. 
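+        // The precedence applied below is: an explicitly supplied credential
+        // provider, then a static empty token when the service account disables
+        // OAuth, then service account credentials, then application default
+        // credentials, and finally the instance metadata server.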
+ let application_default_credentials = ApplicationDefaultCredentials::read( + self.application_credentials_path.as_deref(), + )?; + + let disable_oauth = service_account_credentials + .as_ref() + .map(|c| c.disable_oauth) + .unwrap_or(false); + + let gcs_base_url: String = service_account_credentials + .as_ref() + .and_then(|c| c.gcs_base_url.clone()) + .unwrap_or_else(|| DEFAULT_GCS_BASE_URL.to_string()); + + let credentials = if let Some(credentials) = self.credentials { + credentials + } else if disable_oauth { + Arc::new(StaticCredentialProvider::new(GcpCredential { + bearer: "".to_string(), + })) as _ + } else if let Some(credentials) = service_account_credentials { + Arc::new(TokenCredentialProvider::new( + credentials.token_provider()?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } else if let Some(credentials) = application_default_credentials { + match credentials { + ApplicationDefaultCredentials::AuthorizedUser(token) => { + Arc::new(TokenCredentialProvider::new( + token, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + ApplicationDefaultCredentials::ServiceAccount(token) => { + Arc::new(TokenCredentialProvider::new( + token.token_provider()?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + } + } else { + Arc::new(TokenCredentialProvider::new( + InstanceCredentialProvider::default(), + self.client_options.metadata_client()?, + self.retry_config.clone(), + )) as _ + }; + + let config = GoogleCloudStorageConfig { + base_url: gcs_base_url, + credentials, + bucket_name, + retry_config: self.retry_config, + client_options: self.client_options, + }; + + Ok(GoogleCloudStorage { + client: Arc::new(GoogleCloudStorageClient::new(config)?), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + use std::io::Write; + use tempfile::NamedTempFile; + + const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; + + #[test] + fn gcs_test_service_account_key_and_path() { + let mut tfile = NamedTempFile::new().unwrap(); + write!(tfile, "{FAKE_KEY}").unwrap(); + let _ = GoogleCloudStorageBuilder::new() + .with_service_account_key(FAKE_KEY) + .with_service_account_path(tfile.path().to_str().unwrap()) + .with_bucket_name("foo") + .build() + .unwrap_err(); + } + + #[test] + fn gcs_test_config_from_map() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ("google_service_account", google_service_account.clone()), + ("google_bucket_name", google_bucket_name.clone()), + ]); + + let builder = options + .iter() + .fold(GoogleCloudStorageBuilder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); + + assert_eq!( + builder.service_account_path.unwrap(), + google_service_account.as_str() + ); + assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); + } + + #[test] + fn gcs_test_config_aliases() { + // Service account path + for alias in [ + "google_service_account", + "service_account", + "google_service_account_path", + "service_account_path", + ] { + let builder = GoogleCloudStorageBuilder::new() + .with_config(alias.parse().unwrap(), "/fake/path.json"); + assert_eq!("/fake/path.json", builder.service_account_path.unwrap()); + } + + // Service account key + for alias in ["google_service_account_key", 
"service_account_key"] { + let builder = GoogleCloudStorageBuilder::new() + .with_config(alias.parse().unwrap(), FAKE_KEY); + assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); + } + + // Bucket name + for alias in [ + "google_bucket", + "google_bucket_name", + "bucket", + "bucket_name", + ] { + let builder = GoogleCloudStorageBuilder::new() + .with_config(alias.parse().unwrap(), "fake_bucket"); + assert_eq!("fake_bucket", builder.bucket_name.unwrap()); + } + } + + #[tokio::test] + async fn gcs_test_proxy_url() { + let mut tfile = NamedTempFile::new().unwrap(); + write!(tfile, "{FAKE_KEY}").unwrap(); + let service_account_path = tfile.path(); + let gcs = GoogleCloudStorageBuilder::new() + .with_service_account_path(service_account_path.to_str().unwrap()) + .with_bucket_name("foo") + .with_proxy_url("https://example.com") + .build(); + assert!(dbg!(gcs).is_ok()); + + let err = GoogleCloudStorageBuilder::new() + .with_service_account_path(service_account_path.to_str().unwrap()) + .with_bucket_name("foo") + .with_proxy_url("asdf://example.com") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + "Generic HTTP client error: builder error: unknown proxy scheme", + err + ); + } + + #[test] + fn gcs_test_urls() { + let mut builder = GoogleCloudStorageBuilder::new(); + builder.parse_url("gs://bucket/path").unwrap(); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let err_cases = ["mailto://bucket/path", "gs://bucket.mydomain/path"]; + let mut builder = GoogleCloudStorageBuilder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } + } + + #[test] + fn gcs_test_service_account_key_only() { + let _ = GoogleCloudStorageBuilder::new() + .with_service_account_key(FAKE_KEY) + .with_bucket_name("foo") + .build() + .unwrap(); + } + + #[test] + fn gcs_test_config_get_value() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let builder = GoogleCloudStorageBuilder::new() + .with_config(GoogleConfigKey::ServiceAccount, &google_service_account) + .with_config(GoogleConfigKey::Bucket, &google_bucket_name); + + assert_eq!( + builder + .get_config_value(&GoogleConfigKey::ServiceAccount) + .unwrap(), + google_service_account + ); + assert_eq!( + builder.get_config_value(&GoogleConfigKey::Bucket).unwrap(), + google_bucket_name + ); + } +} diff --git a/src/gcp/client.rs b/src/gcp/client.rs new file mode 100644 index 0000000..9141a9d --- /dev/null +++ b/src/gcp/client.rs @@ -0,0 +1,446 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::get::GetClient; +use crate::client::header::get_etag; +use crate::client::list::ListClient; +use crate::client::list_response::ListResponse; +use crate::client::retry::RetryExt; +use crate::client::GetOptionsExt; +use crate::gcp::{GcpCredential, GcpCredentialProvider, STORE}; +use crate::multipart::PartId; +use crate::path::{Path, DELIMITER}; +use crate::{ClientOptions, GetOptions, ListResult, MultipartId, Result, RetryConfig}; +use async_trait::async_trait; +use bytes::{Buf, Bytes}; +use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; +use reqwest::{header, Client, Method, Response, StatusCode}; +use serde::Serialize; +use snafu::{ResultExt, Snafu}; +use std::sync::Arc; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Error performing list request: {}", source))] + ListRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting list response body: {}", source))] + ListResponseBody { source: reqwest::Error }, + + #[snafu(display("Got invalid list response: {}", source))] + InvalidListResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Error performing get request {}: {}", path, source))] + GetRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error performing delete request {}: {}", path, source))] + DeleteRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error performing put request {}: {}", path, source))] + PutRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error getting put response body: {}", source))] + PutResponseBody { source: reqwest::Error }, + + #[snafu(display("Got invalid put response: {}", source))] + InvalidPutResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Error performing post request {}: {}", path, source))] + PostRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + match err { + Error::GetRequest { source, path } + | Error::DeleteRequest { source, path } + | Error::PutRequest { source, path } => source.error(STORE, path), + _ => Self::Generic { + store: STORE, + source: Box::new(err), + }, + } + } +} + +#[derive(Debug)] +pub struct GoogleCloudStorageConfig { + pub base_url: String, + + pub credentials: GcpCredentialProvider, + + pub bucket_name: String, + + pub retry_config: RetryConfig, + + pub client_options: ClientOptions, +} + +#[derive(Debug)] +pub struct GoogleCloudStorageClient { + config: GoogleCloudStorageConfig, + + client: Client, + + bucket_name_encoded: String, + + // TODO: Hook this up in tests + max_list_results: Option, +} + +impl GoogleCloudStorageClient { + pub fn new(config: GoogleCloudStorageConfig) -> Result { + let client = config.client_options.client()?; + let bucket_name_encoded = + percent_encode(config.bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); + + Ok(Self { + config, + client, + bucket_name_encoded, + max_list_results: None, + }) + } + + pub fn config(&self) -> &GoogleCloudStorageConfig { + &self.config + } + + async fn get_credential(&self) -> Result> { + self.config.credentials.get_credential().await + } + + pub fn object_url(&self, path: &Path) -> String { + let encoded = utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); + format!( + "{}/{}/{}", + self.config.base_url, 
self.bucket_name_encoded, encoded + ) + } + + /// Perform a put request + /// + /// Returns the new ETag + pub async fn put_request( + &self, + path: &Path, + payload: Bytes, + query: &T, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + let content_type = self + .config + .client_options + .get_content_type(path) + .unwrap_or("application/octet-stream"); + + let response = self + .client + .request(Method::PUT, url) + .query(query) + .bearer_auth(&credential.bearer) + .header(header::CONTENT_TYPE, content_type) + .header(header::CONTENT_LENGTH, payload.len()) + .body(payload) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: path.as_ref(), + })?; + + Ok(get_etag(response.headers()).context(MetadataSnafu)?) + } + + /// Initiate a multi-part upload + pub async fn multipart_initiate(&self, path: &Path) -> Result { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + let content_type = self + .config + .client_options + .get_content_type(path) + .unwrap_or("application/octet-stream"); + + let response = self + .client + .request(Method::POST, &url) + .bearer_auth(&credential.bearer) + .header(header::CONTENT_TYPE, content_type) + .header(header::CONTENT_LENGTH, "0") + .query(&[("uploads", "")]) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: path.as_ref(), + })?; + + let data = response.bytes().await.context(PutResponseBodySnafu)?; + let result: InitiateMultipartUploadResult = + quick_xml::de::from_reader(data.as_ref().reader()) + .context(InvalidPutResponseSnafu)?; + + Ok(result.upload_id) + } + + /// Cleanup unused parts + pub async fn multipart_cleanup( + &self, + path: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + self.client + .request(Method::DELETE, &url) + .bearer_auth(&credential.bearer) + .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_LENGTH, "0") + .query(&[("uploadId", multipart_id)]) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: path.as_ref(), + })?; + + Ok(()) + } + + pub async fn multipart_complete( + &self, + path: &Path, + multipart_id: &MultipartId, + completed_parts: Vec, + ) -> Result<()> { + let upload_id = multipart_id.clone(); + let url = self.object_url(path); + + let parts = completed_parts + .into_iter() + .enumerate() + .map(|(part_number, part)| MultipartPart { + e_tag: part.content_id, + part_number: part_number + 1, + }) + .collect(); + + let credential = self.get_credential().await?; + let upload_info = CompleteMultipartUpload { parts }; + + let data = quick_xml::se::to_string(&upload_info) + .context(InvalidPutResponseSnafu)? 
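+            // Illustrative shape of the serialized body (values are made up):
+            // <CompleteMultipartUpload>
+            //   <Part><PartNumber>1</PartNumber><ETag>"etag"</ETag></Part>
+            // </CompleteMultipartUpload>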
+ // We cannot disable the escaping that transforms "/" to ""e;" :( + // https://github.com/tafia/quick-xml/issues/362 + // https://github.com/tafia/quick-xml/issues/350 + .replace(""", "\""); + + self.client + .request(Method::POST, &url) + .bearer_auth(&credential.bearer) + .query(&[("uploadId", upload_id)]) + .body(data) + .send_retry(&self.config.retry_config) + .await + .context(PostRequestSnafu { + path: path.as_ref(), + })?; + + Ok(()) + } + + /// Perform a delete request + pub async fn delete_request(&self, path: &Path) -> Result<()> { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + let builder = self.client.request(Method::DELETE, url); + builder + .bearer_auth(&credential.bearer) + .send_retry(&self.config.retry_config) + .await + .context(DeleteRequestSnafu { + path: path.as_ref(), + })?; + + Ok(()) + } + + /// Perform a copy request + pub async fn copy_request( + &self, + from: &Path, + to: &Path, + if_not_exists: bool, + ) -> Result<()> { + let credential = self.get_credential().await?; + let url = self.object_url(to); + + let from = utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); + let source = format!("{}/{}", self.bucket_name_encoded, from); + + let mut builder = self + .client + .request(Method::PUT, url) + .header("x-goog-copy-source", source); + + if if_not_exists { + builder = builder.header("x-goog-if-generation-match", 0); + } + + builder + .bearer_auth(&credential.bearer) + // Needed if reqwest is compiled with native-tls instead of rustls-tls + // See https://github.com/apache/arrow-rs/pull/3921 + .header(header::CONTENT_LENGTH, 0) + .send_retry(&self.config.retry_config) + .await + .map_err(|err| match err.status() { + Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { + source: Box::new(err), + path: to.to_string(), + }, + _ => err.error(STORE, from.to_string()), + })?; + + Ok(()) + } +} + +#[async_trait] +impl GetClient for GoogleCloudStorageClient { + const STORE: &'static str = STORE; + + /// Perform a get request + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + let method = match options.head { + true => Method::HEAD, + false => Method::GET, + }; + + let mut request = self.client.request(method, url).with_get_options(options); + + if !credential.bearer.is_empty() { + request = request.bearer_auth(&credential.bearer); + } + + let response = request + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } +} + +#[async_trait] +impl ListClient for GoogleCloudStorageClient { + /// Perform a list request + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + page_token: Option<&str>, + offset: Option<&str>, + ) -> Result<(ListResult, Option)> { + assert!(offset.is_none()); // Not yet supported + + let credential = self.get_credential().await?; + let url = format!("{}/{}", self.config.base_url, self.bucket_name_encoded); + + let mut query = Vec::with_capacity(5); + query.push(("list-type", "2")); + if delimiter { + query.push(("delimiter", DELIMITER)) + } + + if let Some(prefix) = &prefix { + query.push(("prefix", prefix)) + } + + if let Some(page_token) = page_token { + query.push(("continuation-token", page_token)) + } + + if let Some(max_results) = &self.max_list_results { + query.push(("max-keys", max_results)) + } + + let response = self + .client + .request(Method::GET, url) + 
.query(&query) + .bearer_auth(&credential.bearer) + .send_retry(&self.config.retry_config) + .await + .context(ListRequestSnafu)? + .bytes() + .await + .context(ListResponseBodySnafu)?; + + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .context(InvalidListResponseSnafu)?; + + let token = response.next_continuation_token.take(); + Ok((response.try_into()?, token)) + } +} + +#[derive(serde::Deserialize, Debug)] +#[serde(rename_all = "PascalCase")] +struct InitiateMultipartUploadResult { + upload_id: String, +} + +#[derive(serde::Serialize, Debug)] +#[serde(rename_all = "PascalCase", rename(serialize = "Part"))] +struct MultipartPart { + #[serde(rename = "PartNumber")] + part_number: usize, + e_tag: String, +} + +#[derive(serde::Serialize, Debug)] +#[serde(rename_all = "PascalCase")] +struct CompleteMultipartUpload { + #[serde(rename = "Part", default)] + parts: Vec, +} diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 97755c0..7c69d28 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -29,176 +29,34 @@ //! to abort the upload and drop those unneeded parts. In addition, you may wish to //! consider implementing automatic clean up of unused parts that are older than one //! week. -use std::str::FromStr; use std::sync::Arc; -use async_trait::async_trait; -use bytes::{Buf, Bytes}; -use futures::stream::BoxStream; -use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; -use reqwest::{header, Client, Method, Response, StatusCode}; -use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; -use tokio::io::AsyncWrite; -use url::Url; - -use crate::client::get::{GetClient, GetClientExt}; -use crate::client::list::{ListClient, ListClientExt}; -use crate::client::list_response::ListResponse; -use crate::client::retry::RetryExt; -use crate::client::{ - ClientConfigKey, CredentialProvider, GetOptionsExt, StaticCredentialProvider, - TokenCredentialProvider, -}; +use crate::client::CredentialProvider; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, - path::{Path, DELIMITER}, - ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, PutResult, Result, RetryConfig, + path::Path, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, + Result, }; +use async_trait::async_trait; +use bytes::Bytes; +use client::GoogleCloudStorageClient; +use futures::stream::BoxStream; +use tokio::io::AsyncWrite; -use credential::{InstanceCredentialProvider, ServiceAccountCredentials}; +use crate::client::get::GetClientExt; +use crate::client::list::ListClientExt; +pub use builder::{GoogleCloudStorageBuilder, GoogleConfigKey}; +pub use credential::GcpCredential; +mod builder; +mod client; mod credential; const STORE: &str = "GCS"; /// [`CredentialProvider`] for [`GoogleCloudStorage`] pub type GcpCredentialProvider = Arc>; -use crate::client::header::get_etag; -use crate::gcp::credential::{ApplicationDefaultCredentials, DEFAULT_GCS_BASE_URL}; -pub use credential::GcpCredential; - -#[derive(Debug, Snafu)] -enum Error { - #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] - InvalidXMLResponse { - source: quick_xml::de::DeError, - method: String, - url: String, - data: Bytes, - }, - - #[snafu(display("Error performing list request: {}", source))] - ListRequest { source: crate::client::retry::Error }, - - #[snafu(display("Error getting list response body: {}", source))] - ListResponseBody { source: reqwest::Error }, - - #[snafu(display("Got invalid 
list response: {}", source))] - InvalidListResponse { source: quick_xml::de::DeError }, - - #[snafu(display("Error performing get request {}: {}", path, source))] - GetRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error getting get response body {}: {}", path, source))] - GetResponseBody { - source: reqwest::Error, - path: String, - }, - - #[snafu(display("Error performing delete request {}: {}", path, source))] - DeleteRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error performing put request {}: {}", path, source))] - PutRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error getting put response body: {}", source))] - PutResponseBody { source: reqwest::Error }, - - #[snafu(display("Got invalid put response: {}", source))] - InvalidPutResponse { source: quick_xml::de::DeError }, - - #[snafu(display("Error performing post request {}: {}", path, source))] - PostRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error decoding object size: {}", source))] - InvalidSize { source: std::num::ParseIntError }, - - #[snafu(display("Missing bucket name"))] - MissingBucketName {}, - - #[snafu(display( - "One of service account path or service account key may be provided." - ))] - ServiceAccountPathAndKeyProvided, - - #[snafu(display("GCP credential error: {}", source))] - Credential { source: credential::Error }, - - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] - UnableToParseUrl { - source: url::ParseError, - url: String, - }, - - #[snafu(display( - "Unknown url scheme cannot be parsed into storage location: {}", - scheme - ))] - UnknownUrlScheme { scheme: String }, - - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] - UrlNotRecognised { url: String }, - - #[snafu(display("Configuration key: '{}' is not known.", key))] - UnknownConfigurationKey { key: String }, - - #[snafu(display("Unable to extract metadata from headers: {}", source))] - Metadata { - source: crate::client::header::Error, - }, -} - -impl From for super::Error { - fn from(err: Error) -> Self { - match err { - Error::GetRequest { source, path } - | Error::DeleteRequest { source, path } - | Error::PutRequest { source, path } => source.error(STORE, path), - Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: STORE, key } - } - _ => Self::Generic { - store: STORE, - source: Box::new(err), - }, - } - } -} - -#[derive(serde::Deserialize, Debug)] -#[serde(rename_all = "PascalCase")] -struct InitiateMultipartUploadResult { - upload_id: String, -} - -#[derive(serde::Serialize, Debug)] -#[serde(rename_all = "PascalCase", rename(serialize = "Part"))] -struct MultipartPart { - #[serde(rename = "PartNumber")] - part_number: usize, - e_tag: String, -} - -#[derive(serde::Serialize, Debug)] -#[serde(rename_all = "PascalCase")] -struct CompleteMultipartUpload { - #[serde(rename = "Part", default)] - parts: Vec, -} /// Interface for [Google Cloud Storage](https://cloud.google.com/storage/). 
#[derive(Debug)] @@ -208,271 +66,18 @@ pub struct GoogleCloudStorage { impl std::fmt::Display for GoogleCloudStorage { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "GoogleCloudStorage({})", self.client.bucket_name) + write!( + f, + "GoogleCloudStorage({})", + self.client.config().bucket_name + ) } } impl GoogleCloudStorage { /// Returns the [`GcpCredentialProvider`] used by [`GoogleCloudStorage`] pub fn credentials(&self) -> &GcpCredentialProvider { - &self.client.credentials - } -} - -#[derive(Debug)] -struct GoogleCloudStorageClient { - client: Client, - base_url: String, - - credentials: GcpCredentialProvider, - - bucket_name: String, - bucket_name_encoded: String, - - retry_config: RetryConfig, - client_options: ClientOptions, - - // TODO: Hook this up in tests - max_list_results: Option, -} - -impl GoogleCloudStorageClient { - async fn get_credential(&self) -> Result> { - self.credentials.get_credential().await - } - - fn object_url(&self, path: &Path) -> String { - let encoded = utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); - format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, encoded) - } - - /// Perform a put request - /// - /// Returns the new ETag - async fn put_request( - &self, - path: &Path, - payload: Bytes, - query: &T, - ) -> Result { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let content_type = self - .client_options - .get_content_type(path) - .unwrap_or("application/octet-stream"); - - let response = self - .client - .request(Method::PUT, url) - .query(query) - .bearer_auth(&credential.bearer) - .header(header::CONTENT_TYPE, content_type) - .header(header::CONTENT_LENGTH, payload.len()) - .body(payload) - .send_retry(&self.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; - - Ok(get_etag(response.headers()).context(MetadataSnafu)?) 
- } - - /// Initiate a multi-part upload - async fn multipart_initiate(&self, path: &Path) -> Result { - let credential = self.get_credential().await?; - let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); - - let content_type = self - .client_options - .get_content_type(path) - .unwrap_or("application/octet-stream"); - - let response = self - .client - .request(Method::POST, &url) - .bearer_auth(&credential.bearer) - .header(header::CONTENT_TYPE, content_type) - .header(header::CONTENT_LENGTH, "0") - .query(&[("uploads", "")]) - .send_retry(&self.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; - - let data = response.bytes().await.context(PutResponseBodySnafu)?; - let result: InitiateMultipartUploadResult = - quick_xml::de::from_reader(data.as_ref().reader()) - .context(InvalidPutResponseSnafu)?; - - Ok(result.upload_id) - } - - /// Cleanup unused parts - async fn multipart_cleanup( - &self, - path: &str, - multipart_id: &MultipartId, - ) -> Result<()> { - let credential = self.get_credential().await?; - let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); - - self.client - .request(Method::DELETE, &url) - .bearer_auth(&credential.bearer) - .header(header::CONTENT_TYPE, "application/octet-stream") - .header(header::CONTENT_LENGTH, "0") - .query(&[("uploadId", multipart_id)]) - .send_retry(&self.retry_config) - .await - .context(PutRequestSnafu { path })?; - - Ok(()) - } - - /// Perform a delete request - async fn delete_request(&self, path: &Path) -> Result<()> { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let builder = self.client.request(Method::DELETE, url); - builder - .bearer_auth(&credential.bearer) - .send_retry(&self.retry_config) - .await - .context(DeleteRequestSnafu { - path: path.as_ref(), - })?; - - Ok(()) - } - - /// Perform a copy request - async fn copy_request( - &self, - from: &Path, - to: &Path, - if_not_exists: bool, - ) -> Result<()> { - let credential = self.get_credential().await?; - let url = self.object_url(to); - - let from = utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); - let source = format!("{}/{}", self.bucket_name_encoded, from); - - let mut builder = self - .client - .request(Method::PUT, url) - .header("x-goog-copy-source", source); - - if if_not_exists { - builder = builder.header("x-goog-if-generation-match", 0); - } - - builder - .bearer_auth(&credential.bearer) - // Needed if reqwest is compiled with native-tls instead of rustls-tls - // See https://github.com/apache/arrow-rs/pull/3921 - .header(header::CONTENT_LENGTH, 0) - .send_retry(&self.retry_config) - .await - .map_err(|err| match err.status() { - Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { - source: Box::new(err), - path: to.to_string(), - }, - _ => err.error(STORE, from.to_string()), - })?; - - Ok(()) - } -} - -#[async_trait] -impl GetClient for GoogleCloudStorageClient { - const STORE: &'static str = STORE; - - /// Perform a get request - async fn get_request(&self, path: &Path, options: GetOptions) -> Result { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let method = match options.head { - true => Method::HEAD, - false => Method::GET, - }; - - let mut request = self.client.request(method, url).with_get_options(options); - - if !credential.bearer.is_empty() { - request = request.bearer_auth(&credential.bearer); - } - - let response = - request - .send_retry(&self.retry_config) - .await - 
.context(GetRequestSnafu { - path: path.as_ref(), - })?; - - Ok(response) - } -} - -#[async_trait] -impl ListClient for GoogleCloudStorageClient { - /// Perform a list request - async fn list_request( - &self, - prefix: Option<&str>, - delimiter: bool, - page_token: Option<&str>, - offset: Option<&str>, - ) -> Result<(ListResult, Option)> { - assert!(offset.is_none()); // Not yet supported - - let credential = self.get_credential().await?; - let url = format!("{}/{}", self.base_url, self.bucket_name_encoded); - - let mut query = Vec::with_capacity(5); - query.push(("list-type", "2")); - if delimiter { - query.push(("delimiter", DELIMITER)) - } - - if let Some(prefix) = &prefix { - query.push(("prefix", prefix)) - } - - if let Some(page_token) = page_token { - query.push(("continuation-token", page_token)) - } - - if let Some(max_results) = &self.max_list_results { - query.push(("max-keys", max_results)) - } - - let response = self - .client - .request(Method::GET, url) - .query(&query) - .bearer_auth(&credential.bearer) - .send_retry(&self.retry_config) - .await - .context(ListRequestSnafu)? - .bytes() - .await - .context(ListResponseBodySnafu)?; - - let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; - - let token = response.next_continuation_token.take(); - Ok((response.try_into()?, token)) + &self.client.config().credentials } } @@ -504,41 +109,9 @@ impl PutPart for GCSMultipartUpload { /// Complete a multipart upload async fn complete(&self, completed_parts: Vec) -> Result<()> { - let upload_id = self.multipart_id.clone(); - let url = self.client.object_url(&self.path); - - let parts = completed_parts - .into_iter() - .enumerate() - .map(|(part_number, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_number + 1, - }) - .collect(); - - let credential = self.client.get_credential().await?; - let upload_info = CompleteMultipartUpload { parts }; - - let data = quick_xml::se::to_string(&upload_info) - .context(InvalidPutResponseSnafu)? - // We cannot disable the escaping that transforms "/" to ""e;" :( - // https://github.com/tafia/quick-xml/issues/362 - // https://github.com/tafia/quick-xml/issues/350 - .replace(""", "\""); - self.client - .client - .request(Method::POST, &url) - .bearer_auth(&credential.bearer) - .query(&[("uploadId", upload_id)]) - .body(data) - .send_retry(&self.client.retry_config) + .multipart_complete(&self.path, &self.multipart_id, completed_parts) .await - .context(PostRequestSnafu { - path: self.path.as_ref(), - })?; - - Ok(()) } } @@ -570,7 +143,7 @@ impl ObjectStore for GoogleCloudStorage { multipart_id: &MultipartId, ) -> Result<()> { self.client - .multipart_cleanup(location.as_ref(), multipart_id) + .multipart_cleanup(location, multipart_id) .await?; Ok(()) @@ -601,498 +174,16 @@ impl ObjectStore for GoogleCloudStorage { } } -/// Configure a connection to Google Cloud Storage using the specified -/// credentials. 
-/// -/// # Example -/// ``` -/// # let BUCKET_NAME = "foo"; -/// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json"; -/// # use object_store::gcp::GoogleCloudStorageBuilder; -/// let gcs = GoogleCloudStorageBuilder::new() -/// .with_service_account_path(SERVICE_ACCOUNT_PATH) -/// .with_bucket_name(BUCKET_NAME) -/// .build(); -/// ``` -#[derive(Debug, Clone)] -pub struct GoogleCloudStorageBuilder { - /// Bucket name - bucket_name: Option, - /// Url - url: Option, - /// Path to the service account file - service_account_path: Option, - /// The serialized service account key - service_account_key: Option, - /// Path to the application credentials file. - application_credentials_path: Option, - /// Retry config - retry_config: RetryConfig, - /// Client options - client_options: ClientOptions, - /// Credentials - credentials: Option, -} - -/// Configuration keys for [`GoogleCloudStorageBuilder`] -/// -/// Configuration via keys can be done via [`GoogleCloudStorageBuilder::with_config`] -/// -/// # Example -/// ``` -/// # use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; -/// let builder = GoogleCloudStorageBuilder::new() -/// .with_config("google_service_account".parse().unwrap(), "my-service-account") -/// .with_config(GoogleConfigKey::Bucket, "my-bucket"); -/// ``` -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] -#[non_exhaustive] -pub enum GoogleConfigKey { - /// Path to the service account file - /// - /// Supported keys: - /// - `google_service_account` - /// - `service_account` - /// - `google_service_account_path` - /// - `service_account_path` - ServiceAccount, - - /// The serialized service account key. - /// - /// Supported keys: - /// - `google_service_account_key` - /// - `service_account_key` - ServiceAccountKey, - - /// Bucket name - /// - /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details. - /// - /// Supported keys: - /// - `google_bucket` - /// - `google_bucket_name` - /// - `bucket` - /// - `bucket_name` - Bucket, - - /// Application credentials path - /// - /// See [`GoogleCloudStorageBuilder::with_application_credentials`]. 
- ApplicationCredentials, - - /// Client options - Client(ClientConfigKey), -} - -impl AsRef for GoogleConfigKey { - fn as_ref(&self) -> &str { - match self { - Self::ServiceAccount => "google_service_account", - Self::ServiceAccountKey => "google_service_account_key", - Self::Bucket => "google_bucket", - Self::ApplicationCredentials => "google_application_credentials", - Self::Client(key) => key.as_ref(), - } - } -} - -impl FromStr for GoogleConfigKey { - type Err = super::Error; - - fn from_str(s: &str) -> Result { - match s { - "google_service_account" - | "service_account" - | "google_service_account_path" - | "service_account_path" => Ok(Self::ServiceAccount), - "google_service_account_key" | "service_account_key" => { - Ok(Self::ServiceAccountKey) - } - "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { - Ok(Self::Bucket) - } - "google_application_credentials" => Ok(Self::ApplicationCredentials), - _ => match s.parse() { - Ok(key) => Ok(Self::Client(key)), - Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), - }, - } - } -} - -impl Default for GoogleCloudStorageBuilder { - fn default() -> Self { - Self { - bucket_name: None, - service_account_path: None, - service_account_key: None, - application_credentials_path: None, - retry_config: Default::default(), - client_options: ClientOptions::new().with_allow_http(true), - url: None, - credentials: None, - } - } -} - -impl GoogleCloudStorageBuilder { - /// Create a new [`GoogleCloudStorageBuilder`] with default values. - pub fn new() -> Self { - Default::default() - } - - /// Create an instance of [`GoogleCloudStorageBuilder`] with values pre-populated from environment variables. - /// - /// Variables extracted from environment: - /// * GOOGLE_SERVICE_ACCOUNT: location of service account file - /// * GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file - /// * SERVICE_ACCOUNT: (alias) location of service account file - /// * GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key - /// * GOOGLE_BUCKET: bucket name - /// * GOOGLE_BUCKET_NAME: (alias) bucket name - /// - /// # Example - /// ``` - /// use object_store::gcp::GoogleCloudStorageBuilder; - /// - /// let gcs = GoogleCloudStorageBuilder::from_env() - /// .with_bucket_name("foo") - /// .build(); - /// ``` - pub fn from_env() -> Self { - let mut builder = Self::default(); - - if let Ok(service_account_path) = std::env::var("SERVICE_ACCOUNT") { - builder.service_account_path = Some(service_account_path); - } - - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if key.starts_with("GOOGLE_") { - if let Ok(config_key) = key.to_ascii_lowercase().parse() { - builder = builder.with_config(config_key, value); - } - } - } - } - - builder - } - - /// Parse available connection info form a well-known storage URL. - /// - /// The supported url schemes are: - /// - /// - `gs:///` - /// - /// Note: Settings derived from the URL will override any others set on this builder - /// - /// # Example - /// ``` - /// use object_store::gcp::GoogleCloudStorageBuilder; - /// - /// let gcs = GoogleCloudStorageBuilder::from_env() - /// .with_url("gs://bucket/path") - /// .build(); - /// ``` - pub fn with_url(mut self, url: impl Into) -> Self { - self.url = Some(url.into()); - self - } - - /// Set an option on the builder via a key - value pair. 
- pub fn with_config(mut self, key: GoogleConfigKey, value: impl Into) -> Self { - match key { - GoogleConfigKey::ServiceAccount => { - self.service_account_path = Some(value.into()) - } - GoogleConfigKey::ServiceAccountKey => { - self.service_account_key = Some(value.into()) - } - GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), - GoogleConfigKey::ApplicationCredentials => { - self.application_credentials_path = Some(value.into()) - } - GoogleConfigKey::Client(key) => { - self.client_options = self.client_options.with_config(key, value) - } - }; - self - } - - /// Set an option on the builder via a key - value pair. - #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - - /// Get config value via a [`GoogleConfigKey`]. - /// - /// # Example - /// ``` - /// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; - /// - /// let builder = GoogleCloudStorageBuilder::from_env() - /// .with_service_account_key("foo"); - /// let service_account_key = builder.get_config_value(&GoogleConfigKey::ServiceAccountKey).unwrap_or_default(); - /// assert_eq!("foo", &service_account_key); - /// ``` - pub fn get_config_value(&self, key: &GoogleConfigKey) -> Option { - match key { - GoogleConfigKey::ServiceAccount => self.service_account_path.clone(), - GoogleConfigKey::ServiceAccountKey => self.service_account_key.clone(), - GoogleConfigKey::Bucket => self.bucket_name.clone(), - GoogleConfigKey::ApplicationCredentials => { - self.application_credentials_path.clone() - } - GoogleConfigKey::Client(key) => self.client_options.get_config_value(key), - } - } - - /// Sets properties on this builder based on a URL - /// - /// This is a separate member function to allow fallible computation to - /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] - fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - - let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), - false => Ok(s.to_string()), - }; - - match parsed.scheme() { - "gs" => self.bucket_name = Some(validate(host)?), - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), - } - Ok(()) - } - - /// Set the bucket name (required) - pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { - self.bucket_name = Some(bucket_name.into()); - self - } - - /// Set the path to the service account file. - /// - /// This or [`GoogleCloudStorageBuilder::with_service_account_key`] must be - /// set. - /// - /// Example `"/tmp/gcs.json"`. - /// - /// Example contents of `gcs.json`: - /// - /// ```json - /// { - /// "gcs_base_url": "https://localhost:4443", - /// "disable_oauth": true, - /// "client_email": "", - /// "private_key": "" - /// } - /// ``` - pub fn with_service_account_path( - mut self, - service_account_path: impl Into, - ) -> Self { - self.service_account_path = Some(service_account_path.into()); - self - } - - /// Set the service account key. 
The service account must be in the JSON - /// format. - /// - /// This or [`GoogleCloudStorageBuilder::with_service_account_path`] must be - /// set. - pub fn with_service_account_key( - mut self, - service_account: impl Into, - ) -> Self { - self.service_account_key = Some(service_account.into()); - self - } - - /// Set the path to the application credentials file. - /// - /// - pub fn with_application_credentials( - mut self, - application_credentials_path: impl Into, - ) -> Self { - self.application_credentials_path = Some(application_credentials_path.into()); - self - } - - /// Set the credential provider overriding any other options - pub fn with_credentials(mut self, credentials: GcpCredentialProvider) -> Self { - self.credentials = Some(credentials); - self - } - - /// Set the retry configuration - pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { - self.retry_config = retry_config; - self - } - - /// Set the proxy_url to be used by the underlying client - pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_url(proxy_url); - self - } - - /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { - self.client_options = self - .client_options - .with_proxy_ca_certificate(proxy_ca_certificate); - self - } - - /// Set a list of hosts to exclude from proxy connections - pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); - self - } - - /// Sets the client options, overriding any already set - pub fn with_client_options(mut self, options: ClientOptions) -> Self { - self.client_options = options; - self - } - - /// Configure a connection to Google Cloud Storage, returning a - /// new [`GoogleCloudStorage`] and consuming `self` - pub fn build(mut self) -> Result { - if let Some(url) = self.url.take() { - self.parse_url(&url)?; - } - - let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; - - let client = self.client_options.client()?; - - // First try to initialize from the service account information. - let service_account_credentials = - match (self.service_account_path, self.service_account_key) { - (Some(path), None) => Some( - ServiceAccountCredentials::from_file(path) - .context(CredentialSnafu)?, - ), - (None, Some(key)) => Some( - ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?, - ), - (None, None) => None, - (Some(_), Some(_)) => { - return Err(Error::ServiceAccountPathAndKeyProvided.into()) - } - }; - - // Then try to initialize from the application credentials file, or the environment. 
- let application_default_credentials = ApplicationDefaultCredentials::read( - self.application_credentials_path.as_deref(), - )?; - - let disable_oauth = service_account_credentials - .as_ref() - .map(|c| c.disable_oauth) - .unwrap_or(false); - - let gcs_base_url: String = service_account_credentials - .as_ref() - .and_then(|c| c.gcs_base_url.clone()) - .unwrap_or_else(|| DEFAULT_GCS_BASE_URL.to_string()); - - let credentials = if let Some(credentials) = self.credentials { - credentials - } else if disable_oauth { - Arc::new(StaticCredentialProvider::new(GcpCredential { - bearer: "".to_string(), - })) as _ - } else if let Some(credentials) = service_account_credentials { - Arc::new(TokenCredentialProvider::new( - credentials.token_provider()?, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } else if let Some(credentials) = application_default_credentials { - match credentials { - ApplicationDefaultCredentials::AuthorizedUser(token) => { - Arc::new(TokenCredentialProvider::new( - token, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } - ApplicationDefaultCredentials::ServiceAccount(token) => { - Arc::new(TokenCredentialProvider::new( - token.token_provider()?, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } - } - } else { - Arc::new(TokenCredentialProvider::new( - InstanceCredentialProvider::default(), - self.client_options.metadata_client()?, - self.retry_config.clone(), - )) as _ - }; - - let encoded_bucket_name = - percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); - - Ok(GoogleCloudStorage { - client: Arc::new(GoogleCloudStorageClient { - client, - base_url: gcs_base_url, - credentials, - bucket_name, - bucket_name_encoded: encoded_bucket_name, - retry_config: self.retry_config, - client_options: self.client_options, - max_list_results: None, - }), - }) - } -} - #[cfg(test)] mod test { + use bytes::Bytes; - use std::collections::HashMap; - use std::io::Write; - use tempfile::NamedTempFile; + use credential::DEFAULT_GCS_BASE_URL; use crate::tests::*; use super::*; - const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; const NON_EXISTENT_NAME: &str = "nonexistentname"; #[tokio::test] @@ -1104,7 +195,7 @@ mod test { list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; - if integration.client.base_url == DEFAULT_GCS_BASE_URL { + if integration.client.config().base_url == DEFAULT_GCS_BASE_URL { // Fake GCS server doesn't currently honor ifGenerationMatch // https://github.com/fsouza/fake-gcs-server/issues/994 copy_if_not_exists(&integration).await; @@ -1198,140 +289,4 @@ mod test { err ) } - - #[tokio::test] - async fn gcs_test_proxy_url() { - let mut tfile = NamedTempFile::new().unwrap(); - write!(tfile, "{FAKE_KEY}").unwrap(); - let service_account_path = tfile.path(); - let gcs = GoogleCloudStorageBuilder::new() - .with_service_account_path(service_account_path.to_str().unwrap()) - .with_bucket_name("foo") - .with_proxy_url("https://example.com") - .build(); - assert!(dbg!(gcs).is_ok()); - - let err = GoogleCloudStorageBuilder::new() - .with_service_account_path(service_account_path.to_str().unwrap()) - .with_bucket_name("foo") - .with_proxy_url("asdf://example.com") - .build() - .unwrap_err() - .to_string(); - - assert_eq!( - "Generic HTTP client error: builder error: unknown proxy scheme", - err - ); - } - - #[test] 
- fn gcs_test_urls() { - let mut builder = GoogleCloudStorageBuilder::new(); - builder.parse_url("gs://bucket/path").unwrap(); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); - - let err_cases = ["mailto://bucket/path", "gs://bucket.mydomain/path"]; - let mut builder = GoogleCloudStorageBuilder::new(); - for case in err_cases { - builder.parse_url(case).unwrap_err(); - } - } - - #[test] - fn gcs_test_service_account_key_only() { - let _ = GoogleCloudStorageBuilder::new() - .with_service_account_key(FAKE_KEY) - .with_bucket_name("foo") - .build() - .unwrap(); - } - - #[test] - fn gcs_test_service_account_key_and_path() { - let mut tfile = NamedTempFile::new().unwrap(); - write!(tfile, "{FAKE_KEY}").unwrap(); - let _ = GoogleCloudStorageBuilder::new() - .with_service_account_key(FAKE_KEY) - .with_service_account_path(tfile.path().to_str().unwrap()) - .with_bucket_name("foo") - .build() - .unwrap_err(); - } - - #[test] - fn gcs_test_config_from_map() { - let google_service_account = "object_store:fake_service_account".to_string(); - let google_bucket_name = "object_store:fake_bucket".to_string(); - let options = HashMap::from([ - ("google_service_account", google_service_account.clone()), - ("google_bucket_name", google_bucket_name.clone()), - ]); - - let builder = options - .iter() - .fold(GoogleCloudStorageBuilder::new(), |builder, (key, value)| { - builder.with_config(key.parse().unwrap(), value) - }); - - assert_eq!( - builder.service_account_path.unwrap(), - google_service_account.as_str() - ); - assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); - } - - #[test] - fn gcs_test_config_get_value() { - let google_service_account = "object_store:fake_service_account".to_string(); - let google_bucket_name = "object_store:fake_bucket".to_string(); - let builder = GoogleCloudStorageBuilder::new() - .with_config(GoogleConfigKey::ServiceAccount, &google_service_account) - .with_config(GoogleConfigKey::Bucket, &google_bucket_name); - - assert_eq!( - builder - .get_config_value(&GoogleConfigKey::ServiceAccount) - .unwrap(), - google_service_account - ); - assert_eq!( - builder.get_config_value(&GoogleConfigKey::Bucket).unwrap(), - google_bucket_name - ); - } - - #[test] - fn gcs_test_config_aliases() { - // Service account path - for alias in [ - "google_service_account", - "service_account", - "google_service_account_path", - "service_account_path", - ] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), "/fake/path.json"); - assert_eq!("/fake/path.json", builder.service_account_path.unwrap()); - } - - // Service account key - for alias in ["google_service_account_key", "service_account_key"] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), FAKE_KEY); - assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); - } - - // Bucket name - for alias in [ - "google_bucket", - "google_bucket_name", - "bucket", - "bucket_name", - ] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), "fake_bucket"); - assert_eq!("fake_bucket", builder.bucket_name.unwrap()); - } - } } From 8c58e6d6db64d29e807bacb423a7e6fe5172b7ff Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 17:19:40 +0100 Subject: [PATCH 213/397] Use rustfmt default line width (#4960) * Use rustfmt default line width * Further format --- src/aws/builder.rs | 67 ++++++--------------------- src/aws/client.rs | 56 ++++++++-------------- 
src/aws/credential.rs | 39 ++++++---------- src/aws/mod.rs | 34 +++++--------- src/aws/resolve.rs | 5 +- src/azure/builder.rs | 85 ++++++++++------------------------ src/azure/client.rs | 32 +++++-------- src/azure/credential.rs | 60 +++++++++--------------- src/azure/mod.rs | 16 ++----- src/buffered.rs | 48 +++++++++++-------- src/chunked.rs | 9 +--- src/client/backoff.rs | 12 ++--- src/client/get.rs | 13 +++--- src/client/mock_server.rs | 3 +- src/client/mod.rs | 37 ++++----------- src/client/retry.rs | 7 ++- src/delimited.rs | 3 +- src/gcp/builder.rs | 87 +++++++++++------------------------ src/gcp/client.rs | 20 ++------ src/gcp/credential.rs | 12 ++--- src/gcp/mod.rs | 9 +--- src/http/client.rs | 15 ++---- src/http/mod.rs | 10 ++-- src/lib.rs | 70 +++++++--------------------- src/limit.rs | 26 +++-------- src/local.rs | 97 ++++++++++++--------------------------- src/memory.rs | 20 ++------ src/parse.rs | 10 +--- src/path/mod.rs | 30 +++++------- src/prefix.rs | 20 ++------ src/signer.rs | 7 +-- src/throttle.rs | 52 +++++---------------- src/util.rs | 19 ++------ tests/get_range_file.rs | 11 +---- 34 files changed, 320 insertions(+), 721 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 422ba15..75a5299 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -24,9 +24,7 @@ use crate::aws::{ }; use crate::client::TokenCredentialProvider; use crate::config::ConfigValue; -use crate::{ - ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, -}; +use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use itertools::Itertools; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -312,9 +310,7 @@ impl AsRef for AmazonS3ConfigKey { Self::MetadataEndpoint => "aws_metadata_endpoint", Self::UnsignedPayload => "aws_unsigned_payload", Self::Checksum => "aws_checksum_algorithm", - Self::ContainerCredentialsRelativeUri => { - "aws_container_credentials_relative_uri" - } + Self::ContainerCredentialsRelativeUri => "aws_container_credentials_relative_uri", Self::SkipSignature => "aws_skip_signature", Self::CopyIfNotExists => "copy_if_not_exists", Self::Client(opt) => opt.as_ref(), @@ -331,15 +327,9 @@ impl FromStr for AmazonS3ConfigKey { "aws_secret_access_key" | "secret_access_key" => Ok(Self::SecretAccessKey), "aws_default_region" | "default_region" => Ok(Self::DefaultRegion), "aws_region" | "region" => Ok(Self::Region), - "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => { - Ok(Self::Bucket) - } - "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => { - Ok(Self::Endpoint) - } - "aws_session_token" | "aws_token" | "session_token" | "token" => { - Ok(Self::Token) - } + "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => Ok(Self::Bucket), + "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => Ok(Self::Endpoint), + "aws_session_token" | "aws_token" | "session_token" | "token" => Ok(Self::Token), "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { Ok(Self::VirtualHostedStyleRequest) } @@ -347,9 +337,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), - "aws_container_credentials_relative_uri" => { - Ok(Self::ContainerCredentialsRelativeUri) - } + "aws_container_credentials_relative_uri" => 
Ok(Self::ContainerCredentialsRelativeUri), "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), "copy_if_not_exists" => Ok(Self::CopyIfNotExists), // Backwards compatibility @@ -428,16 +416,10 @@ impl AmazonS3Builder { } /// Set an option on the builder via a key - value pair. - pub fn with_config( - mut self, - key: AmazonS3ConfigKey, - value: impl Into, - ) -> Self { + pub fn with_config(mut self, key: AmazonS3ConfigKey, value: impl Into) -> Self { match key { AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), - AmazonS3ConfigKey::SecretAccessKey => { - self.secret_access_key = Some(value.into()) - } + AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key = Some(value.into()), AmazonS3ConfigKey::Region => self.region = Some(value.into()), AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), @@ -449,9 +431,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::DefaultRegion => { self.region = self.region.or_else(|| Some(value.into())) } - AmazonS3ConfigKey::MetadataEndpoint => { - self.metadata_endpoint = Some(value.into()) - } + AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint = Some(value.into()), AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), AmazonS3ConfigKey::Checksum => { self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) @@ -474,11 +454,7 @@ impl AmazonS3Builder { /// /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { + pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { Ok(self.with_config(key.as_ref().parse()?, value)) } @@ -487,9 +463,7 @@ impl AmazonS3Builder { /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. #[deprecated(note = "Use with_config")] #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( + pub fn try_with_options, impl Into)>>( mut self, options: I, ) -> Result { @@ -514,9 +488,7 @@ impl AmazonS3Builder { match key { AmazonS3ConfigKey::AccessKeyId => self.access_key_id.clone(), AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key.clone(), - AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => { - self.region.clone() - } + AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => self.region.clone(), AmazonS3ConfigKey::Bucket => self.bucket_name.clone(), AmazonS3ConfigKey::Endpoint => self.endpoint.clone(), AmazonS3ConfigKey::Token => self.token.clone(), @@ -586,10 +558,7 @@ impl AmazonS3Builder { } /// Set the AWS Secret Access Key (required) - pub fn with_secret_access_key( - mut self, - secret_access_key: impl Into, - ) -> Self { + pub fn with_secret_access_key(mut self, secret_access_key: impl Into) -> Self { self.secret_access_key = Some(secret_access_key.into()); self } @@ -648,10 +617,7 @@ impl AmazonS3Builder { /// consistent with `virtual_hosted_style_request`. /// i.e. if `virtual_hosted_style_request` is set to true /// then `endpoint` should have bucket name included. 
- pub fn with_virtual_hosted_style_request( - mut self, - virtual_hosted_style_request: bool, - ) -> Self { + pub fn with_virtual_hosted_style_request(mut self, virtual_hosted_style_request: bool) -> Self { self.virtual_hosted_style_request = virtual_hosted_style_request.into(); self } @@ -722,10 +688,7 @@ impl AmazonS3Builder { } /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { + pub fn with_proxy_ca_certificate(mut self, proxy_ca_certificate: impl Into) -> Self { self.client_options = self .client_options .with_proxy_ca_certificate(proxy_ca_certificate); diff --git a/src/aws/client.rs b/src/aws/client.rs index eb81e92..6b34b18 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -17,9 +17,7 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; -use crate::aws::{ - AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, -}; +use crate::aws::{AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET}; use crate::client::get::GetClient; use crate::client::header::get_etag; use crate::client::list::ListClient; @@ -28,9 +26,7 @@ use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; -use crate::{ - ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig, -}; +use crate::{ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig}; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -264,8 +260,7 @@ impl S3Client { if let Some(checksum) = self.config().checksum { let digest = checksum.digest(&bytes); - builder = - builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); + builder = builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); if checksum == Checksum::SHA256 { payload_sha256 = Some(digest); } @@ -333,10 +328,7 @@ impl S3Client { /// there was an error for a certain path, the error will be returned in the /// vector. If there was an issue with making the overall request, an error /// will be returned at the top level. - pub async fn bulk_delete_request( - &self, - paths: Vec, - ) -> Result>> { + pub async fn bulk_delete_request(&self, paths: Vec) -> Result>> { if paths.is_empty() { return Ok(Vec::new()); } @@ -348,10 +340,8 @@ impl S3Client { let mut writer = quick_xml::Writer::new(&mut buffer); writer .write_event(xml_events::Event::Start( - xml_events::BytesStart::new("Delete").with_attributes([( - "xmlns", - "http://s3.amazonaws.com/doc/2006-03-01/", - )]), + xml_events::BytesStart::new("Delete") + .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]), )) .unwrap(); for path in &paths { @@ -415,9 +405,11 @@ impl S3Client { .await .context(DeleteObjectsResponseSnafu {})?; - let response: BatchDeleteResponse = quick_xml::de::from_reader(response.reader()) - .map_err(|err| Error::InvalidDeleteObjectsResponse { - source: Box::new(err), + let response: BatchDeleteResponse = + quick_xml::de::from_reader(response.reader()).map_err(|err| { + Error::InvalidDeleteObjectsResponse { + source: Box::new(err), + } })?; // Assume all were ok, then fill in errors. 
This guarantees output order @@ -425,11 +417,10 @@ impl S3Client { let mut results: Vec> = paths.iter().cloned().map(Ok).collect(); for content in response.content.into_iter() { if let DeleteObjectResult::Error(error) = content { - let path = Path::parse(&error.key).map_err(|err| { - Error::InvalidDeleteObjectsResponse { + let path = + Path::parse(&error.key).map_err(|err| Error::InvalidDeleteObjectsResponse { source: Box::new(err), - } - })?; + })?; let i = paths.iter().find_position(|&p| p == &path).unwrap().0; results[i] = Err(Error::from(error).into()); } @@ -439,12 +430,7 @@ impl S3Client { } /// Make an S3 Copy request - pub async fn copy_request( - &self, - from: &Path, - to: &Path, - overwrite: bool, - ) -> Result<()> { + pub async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let credential = self.get_credential().await?; let url = self.config.path_url(to); let source = format!("{}/{}", self.config.bucket, encode_path(from)); @@ -461,9 +447,7 @@ impl S3Client { } None => { return Err(crate::Error::NotSupported { - source: "S3 does not support copy-if-not-exists" - .to_string() - .into(), + source: "S3 does not support copy-if-not-exists".to_string().into(), }) } } @@ -515,8 +499,8 @@ impl S3Client { .await .context(CreateMultipartResponseBodySnafu)?; - let response: InitiateMultipart = quick_xml::de::from_reader(response.reader()) - .context(InvalidMultipartResponseSnafu)?; + let response: InitiateMultipart = + quick_xml::de::from_reader(response.reader()).context(InvalidMultipartResponseSnafu)?; Ok(response.upload_id) } @@ -646,8 +630,8 @@ impl ListClient for S3Client { .await .context(ListResponseBodySnafu)?; - let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; + let mut response: ListResponse = + quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index e0c5de5..d290da8 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -37,8 +37,7 @@ use url::Url; type StdError = Box; /// SHA256 hash of empty string -static EMPTY_SHA256_HASH: &str = - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; +static EMPTY_SHA256_HASH: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; static UNSIGNED_PAYLOAD: &str = "UNSIGNED-PAYLOAD"; static STREAMING_PAYLOAD: &str = "STREAMING-AWS4-HMAC-SHA256-PAYLOAD"; @@ -57,13 +56,7 @@ impl AwsCredential { /// Signs a string /// /// - fn sign( - &self, - to_sign: &str, - date: DateTime, - region: &str, - service: &str, - ) -> String { + fn sign(&self, to_sign: &str, date: DateTime, region: &str, service: &str) -> String { let date_string = date.format("%Y%m%d").to_string(); let date_hmac = hmac_sha256(format!("AWS4{}", self.secret_key), date_string); let region_hmac = hmac_sha256(date_hmac, region); @@ -170,9 +163,9 @@ impl<'a> AwsAuthorizer<'a> { ); // sign the string - let signature = - self.credential - .sign(&string_to_sign, date, self.region, self.service); + let signature = self + .credential + .sign(&string_to_sign, date, self.region, self.service); // build the actual auth header let authorisation = format!( @@ -226,9 +219,9 @@ impl<'a> AwsAuthorizer<'a> { digest, ); - let signature = - self.credential - .sign(&string_to_sign, date, self.region, self.service); + let signature = self + .credential + .sign(&string_to_sign, date, 
self.region, self.service); url.query_pairs_mut() .append_pair("X-Amz-Signature", &signature); @@ -521,9 +514,7 @@ async fn instance_creds( let token = match token_result { Ok(t) => Some(t.text().await?), - Err(e) - if imdsv1_fallback && matches!(e.status(), Some(StatusCode::FORBIDDEN)) => - { + Err(e) if imdsv1_fallback && matches!(e.status(), Some(StatusCode::FORBIDDEN)) => { warn!("received 403 from metadata endpoint, falling back to IMDSv1"); None } @@ -545,8 +536,7 @@ async fn instance_creds( creds_request = creds_request.header(AWS_EC2_METADATA_TOKEN_HEADER, token); } - let creds: InstanceCredentials = - creds_request.send_retry(retry_config).await?.json().await?; + let creds: InstanceCredentials = creds_request.send_retry(retry_config).await?.json().await?; let now = Utc::now(); let ttl = (creds.expiration - now).to_std().unwrap_or_default(); @@ -659,8 +649,7 @@ async fn task_credential( retry: &RetryConfig, url: &str, ) -> Result>, StdError> { - let creds: InstanceCredentials = - client.get(url).send_retry(retry).await?.json().await?; + let creds: InstanceCredentials = client.get(url).send_retry(retry).await?.json().await?; let now = Utc::now(); let ttl = (creds.expiration - now).to_std().unwrap_or_default(); @@ -776,8 +765,7 @@ mod tests { sign_payload: false, }; - let mut url = - Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); + let mut url = Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); authorizer.sign(Method::GET, &mut url, Duration::from_secs(86400)); assert_eq!( @@ -790,7 +778,8 @@ mod tests { X-Amz-Expires=86400&\ X-Amz-SignedHeaders=host&\ X-Amz-Signature=aeeed9bbccd4d02ee5c0109b86d86835f995330da4c265957d157751f604d404" - ).unwrap() + ) + .unwrap() ); } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index a4e39c3..25894a1 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -47,8 +47,8 @@ use crate::client::CredentialProvider; use crate::multipart::{PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, - PutResult, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutResult, + Result, }; mod builder; @@ -67,12 +67,11 @@ pub use resolve::resolve_bucket_region; // // Do not URI-encode any of the unreserved characters that RFC 3986 defines: // A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). 
-pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = - percent_encoding::NON_ALPHANUMERIC - .remove(b'-') - .remove(b'.') - .remove(b'_') - .remove(b'~'); +pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = percent_encoding::NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); /// This struct is used to maintain the URI path encoding const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); @@ -141,15 +140,9 @@ impl Signer for AmazonS3 { /// # Ok(()) /// # } /// ``` - async fn signed_url( - &self, - method: Method, - path: &Path, - expires_in: Duration, - ) -> Result { + async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result { let credential = self.credentials().get_credential().await?; - let authorizer = - AwsAuthorizer::new(&credential, "s3", &self.client.config().region); + let authorizer = AwsAuthorizer::new(&credential, "s3", &self.client.config().region); let path_url = self.path_url(path); let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { @@ -185,11 +178,7 @@ impl ObjectStore for AmazonS3 { Ok((id, Box::new(WriteMultiPart::new(upload, 8)))) } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { self.client .delete_request(location, &[("uploadId", multipart_id)]) .await @@ -314,8 +303,7 @@ mod tests { put_get_delete_list_opts(&integration, is_local).await; // run integration test with checksum set to sha256 - let builder = - AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); + let builder = AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); let integration = builder.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; } diff --git a/src/aws/resolve.rs b/src/aws/resolve.rs index 2b21fab..12c9f26 100644 --- a/src/aws/resolve.rs +++ b/src/aws/resolve.rs @@ -48,10 +48,7 @@ impl From for crate::Error { /// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. 
/// /// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html -pub async fn resolve_bucket_region( - bucket: &str, - client_options: &ClientOptions, -) -> Result { +pub async fn resolve_bucket_region(bucket: &str, client_options: &ClientOptions) -> Result { use reqwest::StatusCode; let endpoint = format!("https://{}.s3.amazonaws.com", bucket); diff --git a/src/azure/builder.rs b/src/azure/builder.rs index eb2de14..915e4c5 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -23,9 +23,7 @@ use crate::azure::credential::{ use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; use crate::client::TokenCredentialProvider; use crate::config::ConfigValue; -use crate::{ - ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, -}; +use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -363,9 +361,7 @@ impl FromStr for AzureConfigKey { | "account_key" | "access_key" => Ok(Self::AccessKey), "azure_storage_account_name" | "account_name" => Ok(Self::AccountName), - "azure_storage_client_id" | "azure_client_id" | "client_id" => { - Ok(Self::ClientId) - } + "azure_storage_client_id" | "azure_client_id" | "client_id" => Ok(Self::ClientId), "azure_storage_client_secret" | "azure_client_secret" | "client_secret" => { Ok(Self::ClientSecret) } @@ -375,27 +371,20 @@ impl FromStr for AzureConfigKey { | "azure_authority_id" | "tenant_id" | "authority_id" => Ok(Self::AuthorityId), - "azure_storage_sas_key" - | "azure_storage_sas_token" - | "sas_key" - | "sas_token" => Ok(Self::SasKey), + "azure_storage_sas_key" | "azure_storage_sas_token" | "sas_key" | "sas_token" => { + Ok(Self::SasKey) + } "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), - "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => { - Ok(Self::Endpoint) - } + "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => Ok(Self::Endpoint), "azure_msi_endpoint" | "azure_identity_endpoint" | "identity_endpoint" | "msi_endpoint" => Ok(Self::MsiEndpoint), "azure_object_id" | "object_id" => Ok(Self::ObjectId), "azure_msi_resource_id" | "msi_resource_id" => Ok(Self::MsiResourceId), - "azure_federated_token_file" | "federated_token_file" => { - Ok(Self::FederatedTokenFile) - } - "azure_use_fabric_endpoint" | "use_fabric_endpoint" => { - Ok(Self::UseFabricEndpoint) - } + "azure_federated_token_file" | "federated_token_file" => Ok(Self::FederatedTokenFile), + "azure_use_fabric_endpoint" | "use_fabric_endpoint" => Ok(Self::UseFabricEndpoint), "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), "azure_container_name" | "container_name" => Ok(Self::ContainerName), // Backwards compatibility @@ -505,9 +494,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), AzureConfigKey::ObjectId => self.object_id = Some(value.into()), AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), - AzureConfigKey::FederatedTokenFile => { - self.federated_token_file = Some(value.into()) - } + AzureConfigKey::FederatedTokenFile => self.federated_token_file = Some(value.into()), AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), AzureConfigKey::UseEmulator => self.use_emulator.parse(value), AzureConfigKey::Endpoint => self.endpoint = 
Some(value.into()), @@ -522,20 +509,14 @@ impl MicrosoftAzureBuilder { /// Set an option on the builder via a key - value pair. #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { + pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs #[deprecated(note = "Use with_config")] #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( + pub fn try_with_options, impl Into)>>( mut self, options: I, ) -> Result { @@ -566,9 +547,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::SasKey => self.sas_key.clone(), AzureConfigKey::Token => self.bearer_token.clone(), AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), - AzureConfigKey::UseFabricEndpoint => { - Some(self.use_fabric_endpoint.to_string()) - } + AzureConfigKey::UseFabricEndpoint => Some(self.use_fabric_endpoint.to_string()), AzureConfigKey::Endpoint => self.endpoint.clone(), AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), AzureConfigKey::ObjectId => self.object_id.clone(), @@ -612,12 +591,10 @@ impl MicrosoftAzureBuilder { } } "https" => match host.split_once('.') { - Some((a, "dfs.core.windows.net")) - | Some((a, "blob.core.windows.net")) => { + Some((a, "dfs.core.windows.net")) | Some((a, "blob.core.windows.net")) => { self.account_name = Some(validate(a)?); } - Some((a, "dfs.fabric.microsoft.com")) - | Some((a, "blob.fabric.microsoft.com")) => { + Some((a, "dfs.fabric.microsoft.com")) | Some((a, "blob.fabric.microsoft.com")) => { self.account_name = Some(validate(a)?); // Attempt to infer the container name from the URL // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv @@ -657,10 +634,7 @@ impl MicrosoftAzureBuilder { } /// Set a static bearer token to be used for authorizing requests - pub fn with_bearer_token_authorization( - mut self, - bearer_token: impl Into, - ) -> Self { + pub fn with_bearer_token_authorization(mut self, bearer_token: impl Into) -> Self { self.bearer_token = Some(bearer_token.into()); self } @@ -697,10 +671,7 @@ impl MicrosoftAzureBuilder { } /// Set query pairs appended to the url for shared access signature authorization - pub fn with_sas_authorization( - mut self, - query_pairs: impl Into>, - ) -> Self { + pub fn with_sas_authorization(mut self, query_pairs: impl Into>) -> Self { self.sas_query_pairs = Some(query_pairs.into()); self } @@ -769,10 +740,7 @@ impl MicrosoftAzureBuilder { } /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { + pub fn with_proxy_ca_certificate(mut self, proxy_ca_certificate: impl Into) -> Self { self.client_options = self .client_options .with_proxy_ca_certificate(proxy_ca_certificate); @@ -800,10 +768,7 @@ impl MicrosoftAzureBuilder { /// Sets a file path for acquiring azure federated identity token in k8s /// /// requires `client_id` and `tenant_id` to be set - pub fn with_federated_token_file( - mut self, - federated_token_file: impl Into, - ) -> Self { + pub fn with_federated_token_file(mut self, federated_token_file: impl Into) -> Self { self.federated_token_file = Some(federated_token_file.into()); self } @@ -855,8 +820,8 @@ impl MicrosoftAzureBuilder { }, }; - let url = Url::parse(&account_url) - .context(UnableToParseUrlSnafu { url: account_url })?; + let url = + Url::parse(&account_url).context(UnableToParseUrlSnafu { url: account_url 
})?; let credential = if let Some(credential) = self.credentials { credential @@ -934,12 +899,10 @@ impl MicrosoftAzureBuilder { /// if present, otherwise falls back to default_url fn url_from_env(env_name: &str, default_url: &str) -> Result { let url = match std::env::var(env_name) { - Ok(env_value) => { - Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { - env_name, - env_value, - })? - } + Ok(env_value) => Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { + env_name, + env_value, + })?, Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), }; Ok(url) diff --git a/src/azure/client.rs b/src/azure/client.rs index f65388b..b5ef021 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -24,9 +24,7 @@ use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::path::DELIMITER; use crate::util::deserialize_rfc1123; -use crate::{ - ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, -}; +use crate::{ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, RetryConfig}; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -215,12 +213,7 @@ impl AzureClient { } /// Make an Azure Copy request - pub async fn copy_request( - &self, - from: &Path, - to: &Path, - overwrite: bool, - ) -> Result<()> { + pub async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let credential = self.get_credential().await?; let url = self.config.path_url(to); let mut source = self.config.path_url(from); @@ -288,16 +281,14 @@ impl GetClient for AzureClient { })?; match response.headers().get("x-ms-resource-type") { - Some(resource) if resource.as_ref() != b"file" => { - Err(crate::Error::NotFound { - path: path.to_string(), - source: format!( - "Not a file, got x-ms-resource-type: {}", - String::from_utf8_lossy(resource.as_ref()) - ) - .into(), - }) - } + Some(resource) if resource.as_ref() != b"file" => Err(crate::Error::NotFound { + path: path.to_string(), + source: format!( + "Not a file, got x-ms-resource-type: {}", + String::from_utf8_lossy(resource.as_ref()) + ) + .into(), + }), _ => Ok(response), } } @@ -347,8 +338,7 @@ impl ListClient for AzureClient { .context(ListResponseBodySnafu)?; let mut response: ListResultInternal = - quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; + quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_marker.take(); Ok((to_list_result(response, prefix)?, token)) diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 8dc6136..fc96ce4 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -28,9 +28,9 @@ use chrono::{DateTime, Utc}; use reqwest::header::ACCEPT; use reqwest::{ header::{ - HeaderMap, HeaderName, HeaderValue, AUTHORIZATION, CONTENT_ENCODING, - CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, - IF_MODIFIED_SINCE, IF_NONE_MATCH, IF_UNMODIFIED_SINCE, RANGE, + HeaderMap, HeaderName, HeaderValue, AUTHORIZATION, CONTENT_ENCODING, CONTENT_LANGUAGE, + CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, IF_MODIFIED_SINCE, IF_NONE_MATCH, + IF_UNMODIFIED_SINCE, RANGE, }, Client, Method, RequestBuilder, }; @@ -46,8 +46,7 @@ use url::Url; static AZURE_VERSION: HeaderValue = HeaderValue::from_static("2021-08-06"); static VERSION: HeaderName = HeaderName::from_static("x-ms-version"); pub(crate) static BLOB_TYPE: HeaderName = HeaderName::from_static("x-ms-blob-type"); -pub(crate) static 
DELETE_SNAPSHOTS: HeaderName = - HeaderName::from_static("x-ms-delete-snapshots"); +pub(crate) static DELETE_SNAPSHOTS: HeaderName = HeaderName::from_static("x-ms-delete-snapshots"); pub(crate) static COPY_SOURCE: HeaderName = HeaderName::from_static("x-ms-copy-source"); static CONTENT_MD5: HeaderName = HeaderName::from_static("content-md5"); pub(crate) const RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; @@ -126,19 +125,11 @@ pub mod authority_hosts { pub(crate) trait CredentialExt { /// Apply authorization to requests against azure storage accounts /// - fn with_azure_authorization( - self, - credential: &AzureCredential, - account: &str, - ) -> Self; + fn with_azure_authorization(self, credential: &AzureCredential, account: &str) -> Self; } impl CredentialExt for RequestBuilder { - fn with_azure_authorization( - mut self, - credential: &AzureCredential, - account: &str, - ) -> Self { + fn with_azure_authorization(mut self, credential: &AzureCredential, account: &str) -> Self { // rfc2822 string should never contain illegal characters let date = Utc::now(); let date_str = date.format(RFC1123_FMT).to_string(); @@ -324,8 +315,8 @@ impl ClientSecretOAuthProvider { tenant_id: impl AsRef, authority_host: Option, ) -> Self { - let authority_host = authority_host - .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); + let authority_host = + authority_host.unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); Self { token_url: format!( @@ -409,9 +400,8 @@ impl ImdsManagedIdentityProvider { msi_res_id: Option, msi_endpoint: Option, ) -> Self { - let msi_endpoint = msi_endpoint.unwrap_or_else(|| { - "http://169.254.169.254/metadata/identity/oauth2/token".to_owned() - }); + let msi_endpoint = msi_endpoint + .unwrap_or_else(|| "http://169.254.169.254/metadata/identity/oauth2/token".to_owned()); Self { msi_endpoint, @@ -493,8 +483,8 @@ impl WorkloadIdentityOAuthProvider { tenant_id: impl AsRef, authority_host: Option, ) -> Self { - let authority_host = authority_host - .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); + let authority_host = + authority_host.unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); Self { token_url: format!( @@ -553,9 +543,7 @@ mod az_cli_date_format { use chrono::{DateTime, TimeZone}; use serde::{self, Deserialize, Deserializer}; - pub fn deserialize<'de, D>( - deserializer: D, - ) -> Result, D::Error> + pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { @@ -614,14 +602,12 @@ impl AzureCliCredential { match Command::new(program).args(args).output() { Ok(az_output) if az_output.status.success() => { - let output = - str::from_utf8(&az_output.stdout).map_err(|_| Error::AzureCli { - message: "az response is not a valid utf-8 string".to_string(), - })?; - - let token_response = - serde_json::from_str::(output) - .context(AzureCliResponseSnafu)?; + let output = str::from_utf8(&az_output.stdout).map_err(|_| Error::AzureCli { + message: "az response is not a valid utf-8 string".to_string(), + })?; + + let token_response = serde_json::from_str::(output) + .context(AzureCliResponseSnafu)?; if !token_response.token_type.eq_ignore_ascii_case("bearer") { return Err(Error::AzureCli { message: format!( @@ -630,12 +616,10 @@ impl AzureCliCredential { ), }); } - let duration = token_response.expires_on.naive_local() - - chrono::Local::now().naive_local(); + let duration = + token_response.expires_on.naive_local() - chrono::Local::now().naive_local(); Ok(TemporaryToken { - token: 
Arc::new(AzureCredential::BearerToken( - token_response.access_token, - )), + token: Arc::new(AzureCredential::BearerToken(token_response.access_token)), expiry: Some( Instant::now() + duration.to_std().map_err(|_| Error::AzureCli { diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 7e1db5b..5f76875 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -30,8 +30,7 @@ use self::client::{BlockId, BlockList}; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, - Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -52,8 +51,7 @@ mod client; mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] -pub type AzureCredentialProvider = - Arc>; +pub type AzureCredentialProvider = Arc>; use crate::client::header::get_etag; pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; @@ -109,11 +107,7 @@ impl ObjectStore for MicrosoftAzure { Ok((String::new(), Box::new(WriteMultiPart::new(inner, 8)))) } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { // There is no way to drop blocks that have been uploaded. Instead, they simply // expire in 7 days. Ok(()) @@ -202,8 +196,8 @@ impl PutPart for AzureMultiPartUpload { mod tests { use super::*; use crate::tests::{ - copy_if_not_exists, get_opts, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, + copy_if_not_exists, get_opts, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list_opts, rename_and_copy, stream_get, }; #[tokio::test] diff --git a/src/buffered.rs b/src/buffered.rs index bdc3f4c..3a1354f 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -87,11 +87,7 @@ impl BufReader { } /// Create a new [`BufReader`] from the provided [`ObjectMeta`], [`ObjectStore`], and `capacity` - pub fn with_capacity( - store: Arc, - meta: &ObjectMeta, - capacity: usize, - ) -> Self { + pub fn with_capacity(store: Arc, meta: &ObjectMeta, capacity: usize) -> Self { Self { path: meta.location.clone(), size: meta.size as _, @@ -138,21 +134,32 @@ impl AsyncSeek for BufReader { fn start_seek(mut self: Pin<&mut Self>, position: SeekFrom) -> std::io::Result<()> { self.cursor = match position { SeekFrom::Start(offset) => offset, - SeekFrom::End(offset) => { - checked_add_signed(self.size,offset).ok_or_else(|| Error::new(ErrorKind::InvalidInput, format!("Seeking {offset} from end of {} byte file would result in overflow", self.size)))? - } + SeekFrom::End(offset) => checked_add_signed(self.size, offset).ok_or_else(|| { + Error::new( + ErrorKind::InvalidInput, + format!( + "Seeking {offset} from end of {} byte file would result in overflow", + self.size + ), + ) + })?, SeekFrom::Current(offset) => { - checked_add_signed(self.cursor, offset).ok_or_else(|| Error::new(ErrorKind::InvalidInput, format!("Seeking {offset} from current offset of {} would result in overflow", self.cursor)))? + checked_add_signed(self.cursor, offset).ok_or_else(|| { + Error::new( + ErrorKind::InvalidInput, + format!( + "Seeking {offset} from current offset of {} would result in overflow", + self.cursor + ), + ) + })? 
} }; self.buffer = Buffer::Empty; Ok(()) } - fn poll_complete( - self: Pin<&mut Self>, - _cx: &mut Context<'_>, - ) -> Poll> { + fn poll_complete(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { Poll::Ready(Ok(self.cursor)) } } @@ -179,10 +186,7 @@ impl AsyncRead for BufReader { } impl AsyncBufRead for BufReader { - fn poll_fill_buf( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let capacity = self.capacity; self.get_mut().poll_fill_buf_impl(cx, capacity) } @@ -238,7 +242,10 @@ mod tests { assert_eq!(&out, &data); let err = reader.seek(SeekFrom::Current(i64::MIN)).await.unwrap_err(); - assert_eq!(err.to_string(), "Seeking -9223372036854775808 from current offset of 4096 would result in overflow"); + assert_eq!( + err.to_string(), + "Seeking -9223372036854775808 from current offset of 4096 would result in overflow" + ); reader.rewind().await.unwrap(); @@ -254,7 +261,10 @@ mod tests { assert!(buf.is_empty()); let err = reader.seek(SeekFrom::Current(1)).await.unwrap_err(); - assert_eq!(err.to_string(), "Seeking 1 from current offset of 18446744073709551615 would result in overflow"); + assert_eq!( + err.to_string(), + "Seeking 1 from current offset of 18446744073709551615 would result in overflow" + ); for capacity in [200, 1024, 4096, DEFAULT_BUFFER_SIZE] { let store = Arc::clone(&store); diff --git a/src/chunked.rs b/src/chunked.rs index 5694c55..021f9f5 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -29,8 +29,7 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, - PutResult, + GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, }; use crate::{MultipartId, Result}; @@ -74,11 +73,7 @@ impl ObjectStore for ChunkedStore { self.inner.put_multipart(location).await } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { self.inner.abort_multipart(location, multipart_id).await } diff --git a/src/client/backoff.rs b/src/client/backoff.rs index a4ca976..e015891 100644 --- a/src/client/backoff.rs +++ b/src/client/backoff.rs @@ -98,10 +98,7 @@ impl Backoff { }; let next_backoff = self.max_backoff_secs.min(rand_backoff); - Duration::from_secs_f64(std::mem::replace( - &mut self.next_backoff_secs, - next_backoff, - )) + Duration::from_secs_f64(std::mem::replace(&mut self.next_backoff_secs, next_backoff)) } } @@ -122,8 +119,7 @@ mod tests { base, }; - let assert_fuzzy_eq = - |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{a} != {b}"); + let assert_fuzzy_eq = |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{a} != {b}"); // Create a static rng that takes the minimum of the range let rng = Box::new(StepRng::new(0, 0)); @@ -149,8 +145,8 @@ mod tests { let mut value = init_backoff_secs; for _ in 0..20 { assert_fuzzy_eq(backoff.next().as_secs_f64(), value); - value = (init_backoff_secs + (value * base - init_backoff_secs) / 2.) 
- .min(max_backoff_secs); + value = + (init_backoff_secs + (value * base - init_backoff_secs) / 2.).min(max_backoff_secs); } } } diff --git a/src/client/get.rs b/src/client/get.rs index 7f68b6d..ed1762f 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -48,13 +48,12 @@ impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.get_request(location, options).await?; - let meta = - header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { - Error::Generic { - store: T::STORE, - source: Box::new(e), - } - })?; + let meta = header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { + Error::Generic { + store: T::STORE, + source: Box::new(e), + } + })?; let stream = response .bytes_stream() diff --git a/src/client/mock_server.rs b/src/client/mock_server.rs index adb7e0f..36c6b65 100644 --- a/src/client/mock_server.rs +++ b/src/client/mock_server.rs @@ -57,8 +57,7 @@ impl MockServer { }); let (shutdown, rx) = oneshot::channel::<()>(); - let server = - Server::bind(&SocketAddr::from(([127, 0, 0, 1], 0))).serve(make_service); + let server = Server::bind(&SocketAddr::from(([127, 0, 0, 1], 0))).serve(make_service); let url = format!("http://{}", server.local_addr()); diff --git a/src/client/mod.rs b/src/client/mod.rs index 137da2b..3c968f1 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -61,8 +61,7 @@ fn map_client_error(e: reqwest::Error) -> super::Error { } } -static DEFAULT_USER_AGENT: &str = - concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); +static DEFAULT_USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); /// Configuration keys for [`ClientOptions`] #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] @@ -231,9 +230,7 @@ impl ClientOptions { ClientConfigKey::ConnectTimeout => { self.connect_timeout = Some(ConfigValue::Deferred(value.into())) } - ClientConfigKey::DefaultContentType => { - self.default_content_type = Some(value.into()) - } + ClientConfigKey::DefaultContentType => self.default_content_type = Some(value.into()), ClientConfigKey::Http1Only => self.http1_only.parse(value), ClientConfigKey::Http2Only => self.http2_only.parse(value), ClientConfigKey::Http2KeepAliveInterval => { @@ -252,13 +249,9 @@ impl ClientOptions { self.pool_max_idle_per_host = Some(ConfigValue::Deferred(value.into())) } ClientConfigKey::ProxyUrl => self.proxy_url = Some(value.into()), - ClientConfigKey::ProxyCaCertificate => { - self.proxy_ca_certificate = Some(value.into()) - } + ClientConfigKey::ProxyCaCertificate => self.proxy_ca_certificate = Some(value.into()), ClientConfigKey::ProxyExcludes => self.proxy_excludes = Some(value.into()), - ClientConfigKey::Timeout => { - self.timeout = Some(ConfigValue::Deferred(value.into())) - } + ClientConfigKey::Timeout => self.timeout = Some(ConfigValue::Deferred(value.into())), ClientConfigKey::UserAgent => { self.user_agent = Some(ConfigValue::Deferred(value.into())) } @@ -270,12 +263,8 @@ impl ClientOptions { pub fn get_config_value(&self, key: &ClientConfigKey) -> Option { match key { ClientConfigKey::AllowHttp => Some(self.allow_http.to_string()), - ClientConfigKey::AllowInvalidCertificates => { - Some(self.allow_insecure.to_string()) - } - ClientConfigKey::ConnectTimeout => { - self.connect_timeout.as_ref().map(fmt_duration) - } + ClientConfigKey::AllowInvalidCertificates => Some(self.allow_insecure.to_string()), + ClientConfigKey::ConnectTimeout => 
self.connect_timeout.as_ref().map(fmt_duration), ClientConfigKey::DefaultContentType => self.default_content_type.clone(), ClientConfigKey::Http1Only => Some(self.http1_only.to_string()), ClientConfigKey::Http2KeepAliveInterval => { @@ -288,9 +277,7 @@ impl ClientOptions { Some(self.http2_keep_alive_while_idle.to_string()) } ClientConfigKey::Http2Only => Some(self.http2_only.to_string()), - ClientConfigKey::PoolIdleTimeout => { - self.pool_idle_timeout.as_ref().map(fmt_duration) - } + ClientConfigKey::PoolIdleTimeout => self.pool_idle_timeout.as_ref().map(fmt_duration), ClientConfigKey::PoolMaxIdlePerHost => { self.pool_max_idle_per_host.as_ref().map(|v| v.to_string()) } @@ -378,10 +365,7 @@ impl ClientOptions { } /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { + pub fn with_proxy_ca_certificate(mut self, proxy_ca_certificate: impl Into) -> Self { self.proxy_ca_certificate = Some(proxy_ca_certificate.into()); self } @@ -522,9 +506,8 @@ impl ClientOptions { let mut proxy = Proxy::all(proxy).map_err(map_client_error)?; if let Some(certificate) = &self.proxy_ca_certificate { - let certificate = - reqwest::tls::Certificate::from_pem(certificate.as_bytes()) - .map_err(map_client_error)?; + let certificate = reqwest::tls::Certificate::from_pem(certificate.as_bytes()) + .map_err(map_client_error)?; builder = builder.add_root_certificate(certificate); } diff --git a/src/client/retry.rs b/src/client/retry.rs index e4d246c..d70d6d8 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -403,7 +403,12 @@ mod tests { } let e = do_request().await.unwrap_err().to_string(); - assert!(e.starts_with("Error after 2 retries: HTTP status server error (502 Bad Gateway) for url"), "{e}"); + assert!( + e.starts_with( + "Error after 2 retries: HTTP status server error (502 Bad Gateway) for url" + ), + "{e}" + ); // Panic results in an incomplete message error in the client mock.push_fn(|_| panic!()); diff --git a/src/delimited.rs b/src/delimited.rs index 1321486..4f25c9d 100644 --- a/src/delimited.rs +++ b/src/delimited.rs @@ -228,8 +228,7 @@ mod tests { #[tokio::test] async fn test_delimiter_stream() { let input = vec!["hello\nworld\nbin", "go\ncup", "cakes"]; - let input_stream = - futures::stream::iter(input.into_iter().map(|s| Ok(Bytes::from(s)))); + let input_stream = futures::stream::iter(input.into_iter().map(|s| Ok(Bytes::from(s)))); let stream = newline_delimited_stream(input_stream); let results: Vec<_> = stream.try_collect().await.unwrap(); diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 920ab8b..2039d23 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -21,12 +21,8 @@ use crate::gcp::credential::{ ApplicationDefaultCredentials, InstanceCredentialProvider, ServiceAccountCredentials, DEFAULT_GCS_BASE_URL, }; -use crate::gcp::{ - credential, GcpCredential, GcpCredentialProvider, GoogleCloudStorage, STORE, -}; -use crate::{ - ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, -}; +use crate::gcp::{credential, GcpCredential, GcpCredentialProvider, GoogleCloudStorage, STORE}; +use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; @@ -38,9 +34,7 @@ enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName {}, - #[snafu(display( - "One of service account path or service account key may be provided." 
- ))] + #[snafu(display("One of service account path or service account key may be provided."))] ServiceAccountPathAndKeyProvided, #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] @@ -188,12 +182,8 @@ impl FromStr for GoogleConfigKey { | "service_account" | "google_service_account_path" | "service_account_path" => Ok(Self::ServiceAccount), - "google_service_account_key" | "service_account_key" => { - Ok(Self::ServiceAccountKey) - } - "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { - Ok(Self::Bucket) - } + "google_service_account_key" | "service_account_key" => Ok(Self::ServiceAccountKey), + "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => Ok(Self::Bucket), "google_application_credentials" => Ok(Self::ApplicationCredentials), _ => match s.parse() { Ok(key) => Ok(Self::Client(key)), @@ -286,12 +276,8 @@ impl GoogleCloudStorageBuilder { /// Set an option on the builder via a key - value pair. pub fn with_config(mut self, key: GoogleConfigKey, value: impl Into) -> Self { match key { - GoogleConfigKey::ServiceAccount => { - self.service_account_path = Some(value.into()) - } - GoogleConfigKey::ServiceAccountKey => { - self.service_account_key = Some(value.into()) - } + GoogleConfigKey::ServiceAccount => self.service_account_path = Some(value.into()), + GoogleConfigKey::ServiceAccountKey => self.service_account_key = Some(value.into()), GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), GoogleConfigKey::ApplicationCredentials => { self.application_credentials_path = Some(value.into()) @@ -305,20 +291,14 @@ impl GoogleCloudStorageBuilder { /// Set an option on the builder via a key - value pair. #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { + pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs #[deprecated(note = "Use with_config")] #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( + pub fn try_with_options, impl Into)>>( mut self, options: I, ) -> Result { @@ -344,9 +324,7 @@ impl GoogleCloudStorageBuilder { GoogleConfigKey::ServiceAccount => self.service_account_path.clone(), GoogleConfigKey::ServiceAccountKey => self.service_account_key.clone(), GoogleConfigKey::Bucket => self.bucket_name.clone(), - GoogleConfigKey::ApplicationCredentials => { - self.application_credentials_path.clone() - } + GoogleConfigKey::ApplicationCredentials => self.application_credentials_path.clone(), GoogleConfigKey::Client(key) => self.client_options.get_config_value(key), } } @@ -394,10 +372,7 @@ impl GoogleCloudStorageBuilder { /// "private_key": "" /// } /// ``` - pub fn with_service_account_path( - mut self, - service_account_path: impl Into, - ) -> Self { + pub fn with_service_account_path(mut self, service_account_path: impl Into) -> Self { self.service_account_path = Some(service_account_path.into()); self } @@ -407,10 +382,7 @@ impl GoogleCloudStorageBuilder { /// /// This or [`GoogleCloudStorageBuilder::with_service_account_path`] must be /// set. 
- pub fn with_service_account_key( - mut self, - service_account: impl Into, - ) -> Self { + pub fn with_service_account_key(mut self, service_account: impl Into) -> Self { self.service_account_key = Some(service_account.into()); self } @@ -445,10 +417,7 @@ impl GoogleCloudStorageBuilder { } /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { + pub fn with_proxy_ca_certificate(mut self, proxy_ca_certificate: impl Into) -> Self { self.client_options = self .client_options .with_proxy_ca_certificate(proxy_ca_certificate); @@ -479,23 +448,19 @@ impl GoogleCloudStorageBuilder { // First try to initialize from the service account information. let service_account_credentials = match (self.service_account_path, self.service_account_key) { - (Some(path), None) => Some( - ServiceAccountCredentials::from_file(path) - .context(CredentialSnafu)?, - ), - (None, Some(key)) => Some( - ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?, - ), - (None, None) => None, - (Some(_), Some(_)) => { - return Err(Error::ServiceAccountPathAndKeyProvided.into()) + (Some(path), None) => { + Some(ServiceAccountCredentials::from_file(path).context(CredentialSnafu)?) + } + (None, Some(key)) => { + Some(ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?) } + (None, None) => None, + (Some(_), Some(_)) => return Err(Error::ServiceAccountPathAndKeyProvided.into()), }; // Then try to initialize from the application credentials file, or the environment. - let application_default_credentials = ApplicationDefaultCredentials::read( - self.application_credentials_path.as_deref(), - )?; + let application_default_credentials = + ApplicationDefaultCredentials::read(self.application_credentials_path.as_deref())?; let disable_oauth = service_account_credentials .as_ref() @@ -617,8 +582,8 @@ mod tests { // Service account key for alias in ["google_service_account_key", "service_account_key"] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), FAKE_KEY); + let builder = + GoogleCloudStorageBuilder::new().with_config(alias.parse().unwrap(), FAKE_KEY); assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); } @@ -629,8 +594,8 @@ mod tests { "bucket", "bucket_name", ] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), "fake_bucket"); + let builder = + GoogleCloudStorageBuilder::new().with_config(alias.parse().unwrap(), "fake_bucket"); assert_eq!("fake_bucket", builder.bucket_name.unwrap()); } } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 9141a9d..4165d78 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -210,18 +210,13 @@ impl GoogleCloudStorageClient { let data = response.bytes().await.context(PutResponseBodySnafu)?; let result: InitiateMultipartUploadResult = - quick_xml::de::from_reader(data.as_ref().reader()) - .context(InvalidPutResponseSnafu)?; + quick_xml::de::from_reader(data.as_ref().reader()).context(InvalidPutResponseSnafu)?; Ok(result.upload_id) } /// Cleanup unused parts - pub async fn multipart_cleanup( - &self, - path: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + pub async fn multipart_cleanup(&self, path: &Path, multipart_id: &MultipartId) -> Result<()> { let credential = self.get_credential().await?; let url = self.object_url(path); @@ -300,12 +295,7 @@ impl GoogleCloudStorageClient { } /// Perform a copy request - pub async fn copy_request( - &self, - from: &Path, - to: &Path, - if_not_exists: 
bool, - ) -> Result<()> { + pub async fn copy_request(&self, from: &Path, to: &Path, if_not_exists: bool) -> Result<()> { let credential = self.get_credential().await?; let url = self.object_url(to); @@ -416,8 +406,8 @@ impl ListClient for GoogleCloudStorageClient { .await .context(ListResponseBodySnafu)?; - let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; + let mut response: ListResponse = + quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 87f8e24..29c7b45 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -226,9 +226,7 @@ impl TokenProvider for SelfSignedJwt { } } -fn read_credentials_file( - service_account_path: impl AsRef, -) -> Result +fn read_credentials_file(service_account_path: impl AsRef) -> Result where T: serde::de::DeserializeOwned, { @@ -329,9 +327,8 @@ async fn make_metadata_request( hostname: &str, retry: &RetryConfig, ) -> crate::Result { - let url = format!( - "http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token" - ); + let url = + format!("http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token"); let response: TokenResponse = client .request(Method::GET, url) .header("Metadata-Flavor", "Google") @@ -396,8 +393,7 @@ pub enum ApplicationDefaultCredentials { } impl ApplicationDefaultCredentials { - const CREDENTIALS_PATH: &'static str = - ".config/gcloud/application_default_credentials.json"; + const CREDENTIALS_PATH: &'static str = ".config/gcloud/application_default_credentials.json"; // Create a new application default credential in the following situations: // 1. a file is passed in and the type matches. 
diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 7c69d28..6512a8b 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -35,8 +35,7 @@ use crate::client::CredentialProvider; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, - Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -137,11 +136,7 @@ impl ObjectStore for GoogleCloudStorage { Ok((upload_id, Box::new(WriteMultiPart::new(inner, 8)))) } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { self.client .multipart_cleanup(location, multipart_id) .await?; diff --git a/src/http/client.rs b/src/http/client.rs index 4c2a7fc..f7593be 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -90,11 +90,7 @@ pub struct Client { } impl Client { - pub fn new( - url: Url, - client_options: ClientOptions, - retry_config: RetryConfig, - ) -> Result { + pub fn new(url: Url, client_options: ClientOptions, retry_config: RetryConfig) -> Result { let client = client_options.client()?; Ok(Self { url, @@ -183,11 +179,7 @@ impl Client { } } - pub async fn list( - &self, - location: Option<&Path>, - depth: &str, - ) -> Result { + pub async fn list(&self, location: Option<&Path>, depth: &str) -> Result { let url = location .map(|path| self.path_url(path)) .unwrap_or_else(|| self.url.clone()); @@ -220,8 +212,7 @@ impl Client { Err(source) => return Err(Error::Request { source }.into()), }; - let status = quick_xml::de::from_reader(response.reader()) - .context(InvalidPropFindSnafu)?; + let status = quick_xml::de::from_reader(response.reader()).context(InvalidPropFindSnafu)?; Ok(status) } diff --git a/src/http/mod.rs b/src/http/mod.rs index e41e4f9..8f61011 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -45,8 +45,8 @@ use crate::client::header::get_etag; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, - ObjectMeta, ObjectStore, PutResult, Result, RetryConfig, + ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, PutResult, Result, RetryConfig, }; mod client; @@ -113,11 +113,7 @@ impl ObjectStore for HttpStore { Err(super::Error::NotImplemented) } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { Err(super::Error::NotImplemented) } diff --git a/src/lib.rs b/src/lib.rs index 8631361..375302e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -256,8 +256,8 @@ mod client; #[cfg(feature = "cloud")] pub use client::{ - backoff::BackoffConfig, retry::RetryConfig, ClientConfigKey, ClientOptions, - CredentialProvider, StaticCredentialProvider, + backoff::BackoffConfig, retry::RetryConfig, ClientConfigKey, ClientOptions, CredentialProvider, + StaticCredentialProvider, }; #[cfg(feature = "cloud")] @@ -323,11 +323,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// /// See documentation for individual stores for exact behavior, as capabilities /// vary by object store. 
- async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()>; + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()>; /// Returns an [`AsyncWrite`] that can be used to append to the object at `location` /// @@ -349,10 +345,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Additionally some stores, such as Azure, may only support appending to objects created /// with [`ObjectStore::append`], and not with [`ObjectStore::put`], [`ObjectStore::copy`], or /// [`ObjectStore::put_multipart`] - async fn append( - &self, - _location: &Path, - ) -> Result> { + async fn append(&self, _location: &Path) -> Result> { Err(Error::NotImplemented) } @@ -376,11 +369,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Return the bytes that are stored at the specified location /// in the given byte ranges - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { coalesce_ranges( ranges, |range| self.get_range(location, range), @@ -547,10 +536,7 @@ macro_rules! as_ref_impl { self.as_ref().abort_multipart(location, multipart_id).await } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { self.as_ref().append(location).await } @@ -558,19 +544,11 @@ macro_rules! as_ref_impl { self.as_ref().get(location).await } - async fn get_opts( - &self, - location: &Path, - options: GetOptions, - ) -> Result { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { self.as_ref().get_opts(location, options).await } - async fn get_range( - &self, - location: &Path, - range: Range, - ) -> Result { + async fn get_range(&self, location: &Path, range: Range) -> Result { self.as_ref().get_range(location, range).await } @@ -609,10 +587,7 @@ macro_rules! 
as_ref_impl { self.as_ref().list_with_offset(prefix, offset) } - async fn list_with_delimiter( - &self, - prefix: Option<&Path>, - ) -> Result { + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.as_ref().list_with_delimiter(prefix).await } @@ -799,20 +774,16 @@ impl GetResult { #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(mut file, path) => { maybe_spawn_blocking(move || { - file.seek(SeekFrom::Start(self.range.start as _)).map_err( - |source| local::Error::Seek { + file.seek(SeekFrom::Start(self.range.start as _)) + .map_err(|source| local::Error::Seek { source, path: path.clone(), - }, - )?; + })?; let mut buffer = Vec::with_capacity(len); file.take(len as _) .read_to_end(&mut buffer) - .map_err(|source| local::Error::UnableToReadBytes { - source, - path, - })?; + .map_err(|source| local::Error::UnableToReadBytes { source, path })?; Ok(buffer.into()) }) @@ -915,11 +886,7 @@ pub enum Error { #[snafu(display("Operation not yet implemented."))] NotImplemented, - #[snafu(display( - "Configuration key: '{}' is not valid for store '{}'.", - key, - store - ))] + #[snafu(display("Configuration key: '{}' is not valid for store '{}'.", key, store))] UnknownConfigurationKey { store: &'static str, key: String }, } @@ -1245,8 +1212,7 @@ mod tests { for (prefix, offset) in cases { let s = storage.list_with_offset(prefix.as_ref(), &offset); - let mut actual: Vec<_> = - s.map_ok(|x| x.location).try_collect().await.unwrap(); + let mut actual: Vec<_> = s.map_ok(|x| x.location).try_collect().await.unwrap(); actual.sort_unstable(); @@ -1254,8 +1220,7 @@ mod tests { .iter() .cloned() .filter(|x| { - let prefix_match = - prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); + let prefix_match = prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); prefix_match && x > &offset }) .collect(); @@ -1627,8 +1592,7 @@ mod tests { storage: &DynObjectStore, location: Option, ) -> crate::Result { - let location = - location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); + let location = location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); let err = storage.head(&location).await.unwrap_err(); assert!(matches!(err, crate::Error::NotFound { .. })); diff --git a/src/limit.rs b/src/limit.rs index 8a45381..cd01a96 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -18,8 +18,8 @@ //! 
An object store that limits the maximum concurrency of the wrapped implementation use crate::{ - BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, - ObjectMeta, ObjectStore, Path, PutResult, Result, StreamExt, + BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, + ObjectStore, Path, PutResult, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -86,19 +86,12 @@ impl ObjectStore for LimitStore { Ok((id, Box::new(PermitWrapper::new(write, permit)))) } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.abort_multipart(location, multipart_id).await } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); let write = self.inner.append(location).await?; Ok(Box::new(PermitWrapper::new(write, permit))) @@ -121,11 +114,7 @@ impl ObjectStore for LimitStore { self.inner.get_range(location, range).await } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.get_ranges(location, ranges).await } @@ -226,10 +215,7 @@ impl PermitWrapper { impl Stream for PermitWrapper { type Item = T::Item; - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { Pin::new(&mut self.inner).poll_next(cx) } diff --git a/src/local.rs b/src/local.rs index 4b7c963..9be3ee9 100644 --- a/src/local.rs +++ b/src/local.rs @@ -19,8 +19,8 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, - GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, - ObjectStore, PutResult, Result, + GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, ObjectStore, + PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -311,11 +311,7 @@ impl ObjectStore for LocalFileSystem { )) } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { let dest = self.config.path_to_filesystem(location)?; let path: PathBuf = staged_upload_path(&dest, multipart_id); @@ -329,10 +325,7 @@ impl ObjectStore for LocalFileSystem { .await } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { // Get the path to the file from the configuration. let path = self.config.path_to_filesystem(location)?; loop { @@ -352,11 +345,10 @@ impl ObjectStore for LocalFileSystem { // If the error is that the file was not found, attempt to create the file and any necessary parent directories. Err(source) if source.kind() == ErrorKind::NotFound => { // Get the path to the parent directory of the file. - let parent = - path.parent().ok_or_else(|| Error::UnableToCreateFile { - path: path.to_path_buf(), - source, - })?; + let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { + path: path.to_path_buf(), + source, + })?; // Create the parent directory and any necessary ancestors. 
tokio::fs::create_dir_all(parent) @@ -367,9 +359,7 @@ impl ObjectStore for LocalFileSystem { continue; } // If any other error occurs, return a `UnableToOpenFile` error. - Err(source) => { - return Err(Error::UnableToOpenFile { source, path }.into()) - } + Err(source) => return Err(Error::UnableToOpenFile { source, path }.into()), } } } @@ -400,11 +390,7 @@ impl ObjectStore for LocalFileSystem { .await } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let path = self.config.path_to_filesystem(location)?; let ranges = ranges.to_vec(); maybe_spawn_blocking(move || { @@ -719,9 +705,7 @@ impl AsyncWrite for LocalUpload { runtime .spawn_blocking(move || (&*file2).write_all(&data)) .map(move |res| match res { - Err(err) => { - Err(io::Error::new(ErrorKind::Other, err)) - } + Err(err) => Err(io::Error::new(ErrorKind::Other, err)), Ok(res) => res.map(move |_| data_len), }), ), @@ -771,31 +755,24 @@ impl AsyncWrite for LocalUpload { // We are moving file into the future, and it will be dropped on it's completion, closing the file. let file = Arc::clone(file); self.inner_state = LocalUploadState::ShuttingDown(Box::pin( - runtime.spawn_blocking(move || (*file).sync_all()).map( - move |res| match res { - Err(err) => { - Err(io::Error::new(io::ErrorKind::Other, err)) - } + runtime + .spawn_blocking(move || (*file).sync_all()) + .map(move |res| match res { + Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)), Ok(res) => res, - }, - ), + }), )); } LocalUploadState::ShuttingDown(fut) => match fut.poll_unpin(cx) { Poll::Ready(res) => { res?; - let staging_path = - staged_upload_path(&self.dest, &self.multipart_id); + let staging_path = staged_upload_path(&self.dest, &self.multipart_id); let dest = self.dest.clone(); self.inner_state = LocalUploadState::Committing(Box::pin( runtime - .spawn_blocking(move || { - std::fs::rename(&staging_path, &dest) - }) + .spawn_blocking(move || std::fs::rename(&staging_path, &dest)) .map(move |res| match res { - Err(err) => { - Err(io::Error::new(io::ErrorKind::Other, err)) - } + Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)), Ok(res) => res, }), )); @@ -905,11 +882,7 @@ pub(crate) fn chunked_stream( .boxed() } -pub(crate) fn read_range( - file: &mut File, - path: &PathBuf, - range: Range, -) -> Result { +pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) .context(SeekSnafu { path })?; @@ -1231,11 +1204,7 @@ mod tests { fs.list_with_delimiter(None).await.unwrap(); } - async fn check_list( - integration: &LocalFileSystem, - prefix: Option<&Path>, - expected: &[&str], - ) { + async fn check_list(integration: &LocalFileSystem, prefix: Option<&Path>, expected: &[&str]) { let result: Vec<_> = integration.list(prefix).try_collect().await.unwrap(); let mut strings: Vec<_> = result.iter().map(|x| x.location.as_ref()).collect(); @@ -1262,8 +1231,7 @@ mod tests { // Follow out of tree symlink let other = NamedTempFile::new().unwrap(); - std::os::unix::fs::symlink(other.path(), root.path().join("test.parquet")) - .unwrap(); + std::os::unix::fs::symlink(other.path(), root.path().join("test.parquet")).unwrap(); // Should return test.parquet even though out of tree check_list(&integration, None, &["a/file.parquet", "test.parquet"]).await; @@ -1288,11 +1256,7 @@ mod tests { .unwrap(); // Ignore broken symlink - 
std::os::unix::fs::symlink( - root.path().join("foo.parquet"), - root.path().join("c"), - ) - .unwrap(); + std::os::unix::fs::symlink(root.path().join("foo.parquet"), root.path().join("c")).unwrap(); check_list( &integration, @@ -1388,7 +1352,9 @@ mod tests { .to_string(); assert!( - err.contains("Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\""), + err.contains( + "Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\"" + ), "{}", err ); @@ -1401,12 +1367,10 @@ mod tests { let location = Path::from("some_file"); let data = Bytes::from("arbitrary data"); - let (multipart_id, mut writer) = - integration.put_multipart(&location).await.unwrap(); + let (multipart_id, mut writer) = integration.put_multipart(&location).await.unwrap(); writer.write_all(&data).await.unwrap(); - let (multipart_id_2, mut writer_2) = - integration.put_multipart(&location).await.unwrap(); + let (multipart_id_2, mut writer_2) = integration.put_multipart(&location).await.unwrap(); assert_ne!(multipart_id, multipart_id_2); writer_2.write_all(&data).await.unwrap(); @@ -1588,9 +1552,8 @@ mod unix_test { unistd::mkfifo(&path, stat::Mode::S_IRWXU).unwrap(); // Need to open read and write side in parallel - let spawned = tokio::task::spawn_blocking(|| { - OpenOptions::new().write(true).open(path).unwrap() - }); + let spawned = + tokio::task::spawn_blocking(|| OpenOptions::new().write(true).open(path).unwrap()); let location = Path::from(filename); integration.head(&location).await.unwrap(); diff --git a/src/memory.rs b/src/memory.rs index 952b457..da7b55d 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -17,8 +17,7 @@ //! An in-memory object store implementation use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, - PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -145,19 +144,12 @@ impl ObjectStore for InMemory { )) } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { // Nothing to clean up Ok(()) } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { Ok(Box::new(InMemoryAppend { location: location.clone(), data: Vec::::new(), @@ -195,11 +187,7 @@ impl ObjectStore for InMemory { }) } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let entry = self.entry(location).await?; ranges .iter() diff --git a/src/parse.rs b/src/parse.rs index 2e72a71..170726f 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -81,8 +81,7 @@ impl ObjectStoreScheme { } ("http", Some(_)) => (Self::Http, url.path()), ("https", Some(host)) => { - if host.ends_with("dfs.core.windows.net") - || host.ends_with("blob.core.windows.net") + if host.ends_with("dfs.core.windows.net") || host.ends_with("blob.core.windows.net") { (Self::MicrosoftAzure, url.path()) } else if host.ends_with("amazonaws.com") { @@ -166,12 +165,7 @@ where let url = &url[..url::Position::BeforePath]; Box::new(crate::http::HttpBuilder::new().with_url(url).build()?) 
as _ } - #[cfg(not(all( - feature = "aws", - feature = "azure", - feature = "gcp", - feature = "http" - )))] + #[cfg(not(all(feature = "aws", feature = "azure", feature = "gcp", feature = "http")))] s => { return Err(super::Error::Generic { store: "parse_url", diff --git a/src/path/mod.rs b/src/path/mod.rs index ab30e0e..e065c31 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -168,9 +168,7 @@ impl Path { /// as defined on the docstring for [`Path`] or does not exist /// /// Note: this will canonicalize the provided path, resolving any symlinks - pub fn from_filesystem_path( - path: impl AsRef, - ) -> Result { + pub fn from_filesystem_path(path: impl AsRef) -> Result { let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu { path: path.as_ref(), })?; @@ -199,12 +197,14 @@ impl Path { ) -> Result { let url = absolute_path_to_url(path)?; let path = match base { - Some(prefix) => url.path().strip_prefix(prefix.path()).ok_or_else(|| { - Error::PrefixMismatch { - path: url.path().to_string(), - prefix: prefix.to_string(), - } - })?, + Some(prefix) => { + url.path() + .strip_prefix(prefix.path()) + .ok_or_else(|| Error::PrefixMismatch { + path: url.path().to_string(), + prefix: prefix.to_string(), + })? + } None => url.path(), }; @@ -256,10 +256,7 @@ impl Path { /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix` /// /// Returns `None` if the prefix does not match - pub fn prefix_match( - &self, - prefix: &Self, - ) -> Option> + '_> { + pub fn prefix_match(&self, prefix: &Self) -> Option> + '_> { let mut stripped = self.raw.strip_prefix(&prefix.raw)?; if !stripped.is_empty() && !prefix.raw.is_empty() { stripped = stripped.strip_prefix(DELIMITER)?; @@ -333,9 +330,7 @@ where #[cfg(not(target_arch = "wasm32"))] /// Given an absolute filesystem path convert it to a URL representation without canonicalization -pub(crate) fn absolute_path_to_url( - path: impl AsRef, -) -> Result { +pub(crate) fn absolute_path_to_url(path: impl AsRef) -> Result { Url::from_file_path(&path).map_err(|_| Error::InvalidPath { path: path.as_ref().into(), }) @@ -498,8 +493,7 @@ mod tests { #[test] fn prefix_matches_with_file_name() { - let haystack = - Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]); + let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]); // All directories match and file name is a prefix let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo"]); diff --git a/src/prefix.rs b/src/prefix.rs index 21f6c1d..c4cb77b 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -23,8 +23,7 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, - Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, }; #[doc(hidden)] @@ -93,19 +92,12 @@ impl ObjectStore for PrefixStore { self.inner.put_multipart(&full_path).await } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { let full_path = self.full_path(location); self.inner.abort_multipart(&full_path, multipart_id).await } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { let full_path = self.full_path(location); self.inner.append(&full_path).await } @@ -125,11 +117,7 @@ impl ObjectStore for PrefixStore { 
self.inner.get_opts(&full_path, options).await } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let full_path = self.full_path(location); self.inner.get_ranges(&full_path, ranges).await } diff --git a/src/signer.rs b/src/signer.rs index f1f35de..f792397 100644 --- a/src/signer.rs +++ b/src/signer.rs @@ -31,10 +31,5 @@ pub trait Signer: Send + Sync + fmt::Debug + 'static { /// the URL should be valid, return a signed [`Url`] created with the object store /// implementation's credentials such that the URL can be handed to something that doesn't have /// access to the object store's credentials, to allow limited access to the object store. - async fn signed_url( - &self, - method: Method, - path: &Path, - expires_in: Duration, - ) -> Result; + async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result; } diff --git a/src/throttle.rs b/src/throttle.rs index d6f191b..c552125 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -21,8 +21,7 @@ use std::ops::Range; use std::{convert::TryInto, sync::Arc}; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, - PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -161,18 +160,11 @@ impl ObjectStore for ThrottledStore { Err(super::Error::NotImplemented) } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { Err(super::Error::NotImplemented) } - async fn append( - &self, - _location: &Path, - ) -> Result> { + async fn append(&self, _location: &Path) -> Result> { Err(super::Error::NotImplemented) } @@ -199,19 +191,15 @@ impl ObjectStore for ThrottledStore { async fn get_range(&self, location: &Path, range: Range) -> Result { let config = self.config(); - let sleep_duration = config.wait_get_per_call - + config.wait_get_per_byte * (range.end - range.start) as u32; + let sleep_duration = + config.wait_get_per_call + config.wait_get_per_byte * (range.end - range.start) as u32; sleep(sleep_duration).await; self.inner.get_range(location, range).await } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let config = self.config(); let total_bytes: usize = ranges.iter().map(|range| range.end - range.start).sum(); @@ -266,8 +254,7 @@ impl ObjectStore for ThrottledStore { match self.inner.list_with_delimiter(prefix).await { Ok(list_result) => { let entries_len = usize_to_u32_saturate(list_result.objects.len()); - sleep(self.config().wait_list_with_delimiter_per_entry * entries_len) - .await; + sleep(self.config().wait_list_with_delimiter_per_entry * entries_len).await; Ok(list_result) } Err(err) => Err(err), @@ -487,10 +474,7 @@ mod tests { assert_bounds!(measure_put(&store, 0).await, 0); } - async fn place_test_object( - store: &ThrottledStore, - n_bytes: Option, - ) -> Path { + async fn place_test_object(store: &ThrottledStore, n_bytes: Option) -> Path { let path = Path::from("foo"); if let Some(n_bytes) = n_bytes { @@ -506,10 +490,7 @@ mod tests { } #[allow(dead_code)] - async fn place_test_objects( - store: &ThrottledStore, - n_entries: usize, - ) -> Path { + async fn 
place_test_objects(store: &ThrottledStore, n_entries: usize) -> Path { let prefix = Path::from("foo"); // clean up store @@ -530,10 +511,7 @@ mod tests { prefix } - async fn measure_delete( - store: &ThrottledStore, - n_bytes: Option, - ) -> Duration { + async fn measure_delete(store: &ThrottledStore, n_bytes: Option) -> Duration { let path = place_test_object(store, n_bytes).await; let t0 = Instant::now(); @@ -543,10 +521,7 @@ mod tests { } #[allow(dead_code)] - async fn measure_get( - store: &ThrottledStore, - n_bytes: Option, - ) -> Duration { + async fn measure_get(store: &ThrottledStore, n_bytes: Option) -> Duration { let path = place_test_object(store, n_bytes).await; let t0 = Instant::now(); @@ -570,10 +545,7 @@ mod tests { } #[allow(dead_code)] - async fn measure_list( - store: &ThrottledStore, - n_entries: usize, - ) -> Duration { + async fn measure_list(store: &ThrottledStore, n_entries: usize) -> Duration { let prefix = place_test_objects(store, n_entries).await; let t0 = Instant::now(); diff --git a/src/util.rs b/src/util.rs index 764582a..fd86ba7 100644 --- a/src/util.rs +++ b/src/util.rs @@ -32,25 +32,19 @@ where D: serde::Deserializer<'de>, { let s: String = serde::Deserialize::deserialize(deserializer)?; - let naive = chrono::NaiveDateTime::parse_from_str(&s, RFC1123_FMT) - .map_err(serde::de::Error::custom)?; + let naive = + chrono::NaiveDateTime::parse_from_str(&s, RFC1123_FMT).map_err(serde::de::Error::custom)?; Ok(chrono::TimeZone::from_utc_datetime(&chrono::Utc, &naive)) } #[cfg(any(feature = "aws", feature = "azure"))] -pub(crate) fn hmac_sha256( - secret: impl AsRef<[u8]>, - bytes: impl AsRef<[u8]>, -) -> ring::hmac::Tag { +pub(crate) fn hmac_sha256(secret: impl AsRef<[u8]>, bytes: impl AsRef<[u8]>) -> ring::hmac::Tag { let key = ring::hmac::Key::new(ring::hmac::HMAC_SHA256, secret.as_ref()); ring::hmac::sign(&key, bytes.as_ref()) } /// Collect a stream into [`Bytes`] avoiding copying in the event of a single chunk -pub async fn collect_bytes( - mut stream: S, - size_hint: Option, -) -> Result +pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result where E: Send, S: Stream> + Send + Unpin, @@ -136,10 +130,7 @@ where } /// Returns a sorted list of ranges that cover `ranges` -fn merge_ranges( - ranges: &[std::ops::Range], - coalesce: usize, -) -> Vec> { +fn merge_ranges(ranges: &[std::ops::Range], coalesce: usize) -> Vec> { if ranges.is_empty() { return vec![]; } diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index 5703d7f..3fa1cc7 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -51,11 +51,7 @@ impl ObjectStore for MyStore { todo!() } - async fn abort_multipart( - &self, - _: &Path, - _: &MultipartId, - ) -> object_store::Result<()> { + async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> object_store::Result<()> { todo!() } @@ -79,10 +75,7 @@ impl ObjectStore for MyStore { todo!() } - async fn list_with_delimiter( - &self, - _: Option<&Path>, - ) -> object_store::Result { + async fn list_with_delimiter(&self, _: Option<&Path>) -> object_store::Result { todo!() } From 08bf7df9348fed1b3ddf780e1f394b7008d4f1e0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 20 Oct 2023 14:41:10 +0100 Subject: [PATCH 214/397] Add ObjectMeta::version and GetOptions::version (#4925) (#4935) --- src/aws/client.rs | 13 ++++++++++++- src/azure/client.rs | 14 +++++++++++++- src/client/get.rs | 5 +---- src/client/header.rs | 13 +++++++++++-- src/client/list_response.rs | 1 + 
src/gcp/client.rs | 7 ++++++- src/http/client.rs | 2 ++ src/lib.rs | 23 +++++++++++++++++++++++ src/local.rs | 1 + src/memory.rs | 4 ++++ src/prefix.rs | 1 + 11 files changed, 75 insertions(+), 9 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 6b34b18..00d6ee4 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -20,6 +20,7 @@ use crate::aws::credential::{AwsCredential, CredentialExt}; use crate::aws::{AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET}; use crate::client::get::GetClient; use crate::client::header::get_etag; +use crate::client::header::HeaderConfig; use crate::client::list::ListClient; use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; @@ -549,6 +550,12 @@ impl S3Client { impl GetClient for S3Client { const STORE: &'static str = STORE; + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: false, + last_modified_required: false, + version_header: Some("x-amz-version-id"), + }; + /// Make an S3 GET request async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let credential = self.get_credential().await?; @@ -558,7 +565,11 @@ impl GetClient for S3Client { false => Method::GET, }; - let builder = self.client.request(method, url); + let mut builder = self.client.request(method, url); + + if let Some(v) = &options.version { + builder = builder.query(&[("versionId", v)]) + } let response = builder .with_get_options(options) diff --git a/src/azure/client.rs b/src/azure/client.rs index b5ef021..cd3df8c 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -19,6 +19,7 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; use crate::client::get::GetClient; +use crate::client::header::HeaderConfig; use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; @@ -254,6 +255,12 @@ impl AzureClient { impl GetClient for AzureClient { const STORE: &'static str = STORE; + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: true, + last_modified_required: true, + version_header: Some("x-ms-version-id"), + }; + /// Make an Azure GET request /// /// @@ -265,12 +272,16 @@ impl GetClient for AzureClient { false => Method::GET, }; - let builder = self + let mut builder = self .client .request(method, url) .header(CONTENT_LENGTH, HeaderValue::from_static("0")) .body(Bytes::new()); + if let Some(v) = &options.version { + builder = builder.query(&[("versionid", v)]) + } + let response = builder .with_get_options(options) .with_azure_authorization(&credential, &self.config.account) @@ -427,6 +438,7 @@ impl TryFrom for ObjectMeta { last_modified: value.properties.last_modified, size: value.properties.content_length as usize, e_tag: value.properties.e_tag, + version: None, // For consistency with S3 and GCP which don't include this }) } } diff --git a/src/client/get.rs b/src/client/get.rs index ed1762f..5f9cac9 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -29,10 +29,7 @@ pub trait GetClient: Send + Sync + 'static { const STORE: &'static str; /// Configure the [`HeaderConfig`] for this client - const HEADER_CONFIG: HeaderConfig = HeaderConfig { - etag_required: true, - last_modified_required: true, - }; + const HEADER_CONFIG: HeaderConfig; async fn get_request(&self, path: &Path, options: GetOptions) -> Result; } diff --git a/src/client/header.rs b/src/client/header.rs index 17f83a2..e674968 100644 --- a/src/client/header.rs +++ 
b/src/client/header.rs @@ -35,6 +35,9 @@ pub struct HeaderConfig { /// /// Defaults to `true` pub last_modified_required: bool, + + /// The version header name if any + pub version_header: Option<&'static str>, } #[derive(Debug, Snafu)] @@ -98,14 +101,20 @@ pub fn header_meta( .context(MissingContentLengthSnafu)?; let content_length = content_length.to_str().context(BadHeaderSnafu)?; - let content_length = content_length + let size = content_length .parse() .context(InvalidContentLengthSnafu { content_length })?; + let version = match cfg.version_header.and_then(|h| headers.get(h)) { + Some(v) => Some(v.to_str().context(BadHeaderSnafu)?.to_string()), + None => None, + }; + Ok(ObjectMeta { location: location.clone(), last_modified, - size: content_length, + version, + size, e_tag, }) } diff --git a/src/client/list_response.rs b/src/client/list_response.rs index 6a3889e..7a170c5 100644 --- a/src/client/list_response.rs +++ b/src/client/list_response.rs @@ -80,6 +80,7 @@ impl TryFrom for ObjectMeta { last_modified: value.last_modified, size: value.size, e_tag: value.e_tag, + version: None, }) } } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 4165d78..558a6f8 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -16,7 +16,7 @@ // under the License. use crate::client::get::GetClient; -use crate::client::header::get_etag; +use crate::client::header::{get_etag, HeaderConfig}; use crate::client::list::ListClient; use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; @@ -333,6 +333,11 @@ impl GoogleCloudStorageClient { #[async_trait] impl GetClient for GoogleCloudStorageClient { const STORE: &'static str = STORE; + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: true, + last_modified_required: true, + version_header: Some("x-goog-generation"), + }; /// Perform a get request async fn get_request(&self, path: &Path, options: GetOptions) -> Result { diff --git a/src/http/client.rs b/src/http/client.rs index f7593be..a7dbdfc 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -277,6 +277,7 @@ impl GetClient for Client { const HEADER_CONFIG: HeaderConfig = HeaderConfig { etag_required: false, last_modified_required: false, + version_header: None, }; async fn get_request(&self, path: &Path, options: GetOptions) -> Result { @@ -375,6 +376,7 @@ impl MultiStatusResponse { last_modified, size: self.size()?, e_tag: self.prop_stat.prop.e_tag.clone(), + version: None, }) } diff --git a/src/lib.rs b/src/lib.rs index 375302e..656b303 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -637,6 +637,8 @@ pub struct ObjectMeta { /// /// pub e_tag: Option, + /// A version indicator for this object + pub version: Option, } /// Options for a get request, such as range @@ -685,6 +687,8 @@ pub struct GetOptions { /// /// pub range: Option>, + /// Request a particular object version + pub version: Option, /// Request transfer of no content /// /// @@ -1379,6 +1383,24 @@ mod tests { }; let err = storage.get_opts(&path, options).await.unwrap_err(); assert!(matches!(err, Error::Precondition { .. 
}), "{err}"); + + if let Some(version) = meta.version { + storage.put(&path, "bar".into()).await.unwrap(); + + let options = GetOptions { + version: Some(version), + ..GetOptions::default() + }; + + // Can retrieve previous version + let get_opts = storage.get_opts(&path, options).await.unwrap(); + let old = get_opts.bytes().await.unwrap(); + assert_eq!(old, b"foo".as_slice()); + + // Current version contains the updated data + let current = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(¤t, b"bar".as_slice()); + } } /// Returns a chunk of length `chunk_length` @@ -1691,6 +1713,7 @@ mod tests { last_modified: Utc.timestamp_nanos(100), size: 100, e_tag: Some("123".to_string()), + version: None, }; let mut options = GetOptions::default(); diff --git a/src/local.rs b/src/local.rs index 9be3ee9..ce9aa46 100644 --- a/src/local.rs +++ b/src/local.rs @@ -969,6 +969,7 @@ fn convert_metadata(metadata: Metadata, location: Path) -> Result { last_modified, size, e_tag: Some(get_etag(&metadata)), + version: None, }) } diff --git a/src/memory.rs b/src/memory.rs index da7b55d..8b9522e 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -166,6 +166,7 @@ impl ObjectStore for InMemory { last_modified: entry.last_modified, size: entry.data.len(), e_tag: Some(e_tag), + version: None, }; options.check_preconditions(&meta)?; @@ -212,6 +213,7 @@ impl ObjectStore for InMemory { last_modified: entry.last_modified, size: entry.data.len(), e_tag: Some(entry.e_tag.to_string()), + version: None, }) } @@ -241,6 +243,7 @@ impl ObjectStore for InMemory { last_modified: value.last_modified, size: value.data.len(), e_tag: Some(value.e_tag.to_string()), + version: None, }) }) .collect(); @@ -285,6 +288,7 @@ impl ObjectStore for InMemory { last_modified: v.last_modified, size: v.data.len(), e_tag: Some(v.e_tag.to_string()), + version: None, }; objects.push(object); } diff --git a/src/prefix.rs b/src/prefix.rs index c4cb77b..b5bff8b 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -73,6 +73,7 @@ impl PrefixStore { size: meta.size, location: self.strip_prefix(meta.location), e_tag: meta.e_tag, + version: None, } } } From 97821add23845b3a229cf577cd4efd57b7e7d565 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 23 Oct 2023 22:27:19 +0100 Subject: [PATCH 215/397] Support ImdsManagedIdentityProvider in Azure Functions (#4976) (#4977) --- src/azure/credential.rs | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/azure/credential.rs b/src/azure/credential.rs index fc96ce4..283d7ff 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -40,7 +40,7 @@ use std::borrow::Cow; use std::process::Command; use std::str; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::{Duration, Instant, SystemTime}; use url::Url; static AZURE_VERSION: HeaderValue = HeaderValue::from_static("2021-08-06"); @@ -293,13 +293,16 @@ fn lexy_sort<'a>( values } +/// #[derive(Deserialize, Debug)] -struct TokenResponse { +struct OAuthTokenResponse { access_token: String, expires_in: u64, } /// Encapsulates the logic to perform an OAuth token challenge +/// +/// #[derive(Debug)] pub struct ClientSecretOAuthProvider { token_url: String, @@ -340,7 +343,7 @@ impl TokenProvider for ClientSecretOAuthProvider { client: &Client, retry: &RetryConfig, ) -> crate::Result>> { - let response: TokenResponse = client + let response: OAuthTokenResponse = client .request(Method::POST, &self.token_url) 
.header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) .form(&[ @@ -363,21 +366,27 @@ impl TokenProvider for ClientSecretOAuthProvider { } } -fn expires_in_string<'de, D>(deserializer: D) -> std::result::Result +fn expires_on_string<'de, D>(deserializer: D) -> std::result::Result where D: serde::de::Deserializer<'de>, { let v = String::deserialize(deserializer)?; - v.parse::().map_err(serde::de::Error::custom) + let v = v.parse::().map_err(serde::de::Error::custom)?; + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(serde::de::Error::custom)?; + + Ok(Instant::now() + Duration::from_secs(v.saturating_sub(now.as_secs()))) } -// NOTE: expires_on is a String version of unix epoch time, not an integer. -// +/// NOTE: expires_on is a String version of unix epoch time, not an integer. +/// +/// #[derive(Debug, Clone, Deserialize)] -struct MsiTokenResponse { +struct ImdsTokenResponse { pub access_token: String, - #[serde(deserialize_with = "expires_in_string")] - pub expires_in: u64, + #[serde(deserialize_with = "expires_on_string")] + pub expires_on: Instant, } /// Attempts authentication using a managed identity that has been assigned to the deployment environment. @@ -450,7 +459,7 @@ impl TokenProvider for ImdsManagedIdentityProvider { builder = builder.header("x-identity-header", val); }; - let response: MsiTokenResponse = builder + let response: ImdsTokenResponse = builder .send_retry(retry) .await .context(TokenRequestSnafu)? @@ -460,12 +469,12 @@ impl TokenProvider for ImdsManagedIdentityProvider { Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), - expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), + expiry: Some(response.expires_on), }) } } -/// Credential for using workload identity dfederation +/// Credential for using workload identity federation /// /// #[derive(Debug)] @@ -512,7 +521,7 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { .map_err(|_| Error::FederatedTokenFile)?; // https://learn.microsoft.com/en-us/azure/active-directory/develop/v2-oauth2-client-creds-grant-flow#third-case-access-token-request-with-a-federated-credential - let response: TokenResponse = client + let response: OAuthTokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) .form(&[ From c339cc6f1b9156b06f2afa6500ffea7239f5677e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Oct 2023 14:50:18 +0100 Subject: [PATCH 216/397] Update quick-xml requirement from 0.30.0 to 0.31.0 in /object_store (#4983) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.30.0...v0.31.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7928648..cb820b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } hyper = { version = "0.14", default-features = false, optional = true } -quick-xml = { version = "0.30.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.31.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 6e1f1c921fff553461a76bac300283e2f3df985e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 25 Oct 2023 11:20:46 +0100 Subject: [PATCH 217/397] Increase default timeout to 30 seconds (#4989) --- src/client/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/client/mod.rs b/src/client/mod.rs index 3c968f1..77eee7f 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -193,6 +193,9 @@ impl Default for ClientOptions { // // // Which recommend a connection timeout of 3.1s and a request timeout of 2s + // + // As object store requests may involve the transfer of non-trivial volumes of data + // we opt for a slightly higher default timeout of 30 seconds Self { user_agent: None, content_type_map: Default::default(), @@ -203,7 +206,7 @@ impl Default for ClientOptions { proxy_excludes: None, allow_http: Default::default(), allow_insecure: Default::default(), - timeout: Some(Duration::from_secs(5).into()), + timeout: Some(Duration::from_secs(30).into()), connect_timeout: Some(Duration::from_secs(5).into()), pool_idle_timeout: None, pool_max_idle_per_host: None, From b5ef9d38cc7ed01d1a1584f0b27a7fce727c4530 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 25 Oct 2023 17:59:37 +0100 Subject: [PATCH 218/397] Add MultiPartStore (#4961) (#4608) (#4971) * Add MultiPartStore (#4961) (#4608) * Parse CompleteMultipartUploadResult (#4965) * More docs * Add integration test * Fix azure * More docs * Don't gate multipart behind feature flag --- src/aws/client.rs | 51 ++++++++++++++++++++++++++++--- src/aws/mod.rs | 49 ++++++++++++++++++++++-------- src/azure/client.rs | 49 ++++++++++++++++++++++++++++-- src/azure/mod.rs | 73 ++++++++++++++++++++++----------------------- src/gcp/client.rs | 34 ++++++++++++++++++--- src/gcp/mod.rs | 52 ++++++++++++++++++++++---------- src/lib.rs | 42 ++++++++++++++++++++++++-- src/multipart.rs | 61 ++++++++++++++++++++++++++++++++++--- src/signer.rs | 3 +- 9 files changed, 329 insertions(+), 85 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 00d6ee4..4e98f25 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -27,7 +27,9 @@ use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; -use crate::{ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig}; +use crate::{ + ClientOptions, GetOptions, ListResult, MultipartId, Path, PutResult, Result, RetryConfig, 
+}; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -115,6 +117,9 @@ pub(crate) enum Error { #[snafu(display("Error performing complete multipart request: {}", source))] CompleteMultipartRequest { source: crate::client::retry::Error }, + #[snafu(display("Error getting complete multipart response body: {}", source))] + CompleteMultipartResponseBody { source: reqwest::Error }, + #[snafu(display("Got invalid list response: {}", source))] InvalidListResponse { source: quick_xml::de::DeError }, @@ -162,6 +167,13 @@ struct MultipartPart { part_number: usize, } +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase", rename = "CompleteMultipartUploadResult")] +struct CompleteMultipartResult { + #[serde(rename = "ETag")] + e_tag: String, +} + #[derive(Deserialize)] #[serde(rename_all = "PascalCase", rename = "DeleteResult")] struct BatchDeleteResponse { @@ -506,12 +518,32 @@ impl S3Client { Ok(response.upload_id) } + pub async fn put_part( + &self, + path: &Path, + upload_id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + let part = (part_idx + 1).to_string(); + + let content_id = self + .put_request( + path, + data, + &[("partNumber", &part), ("uploadId", upload_id)], + ) + .await?; + + Ok(PartId { content_id }) + } + pub async fn complete_multipart( &self, location: &Path, upload_id: &str, parts: Vec, - ) -> Result<()> { + ) -> Result { let parts = parts .into_iter() .enumerate() @@ -527,7 +559,8 @@ impl S3Client { let credential = self.get_credential().await?; let url = self.config.path_url(location); - self.client + let response = self + .client .request(Method::POST, url) .query(&[("uploadId", upload_id)]) .body(body) @@ -542,7 +575,17 @@ impl S3Client { .await .context(CompleteMultipartRequestSnafu)?; - Ok(()) + let data = response + .bytes() + .await + .context(CompleteMultipartResponseBodySnafu)?; + + let response: CompleteMultipartResult = + quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + + Ok(PutResult { + e_tag: Some(response.e_tag), + }) } } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 25894a1..57254c7 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -44,7 +44,7 @@ use crate::aws::client::S3Client; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::CredentialProvider; -use crate::multipart::{PartId, PutPart, WriteMultiPart}; +use crate::multipart::{MultiPartStore, PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutResult, @@ -246,18 +246,9 @@ struct S3MultiPartUpload { #[async_trait] impl PutPart for S3MultiPartUpload { async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - let part = (part_idx + 1).to_string(); - - let content_id = self - .client - .put_request( - &self.location, - buf.into(), - &[("partNumber", &part), ("uploadId", &self.upload_id)], - ) - .await?; - - Ok(PartId { content_id }) + self.client + .put_part(&self.location, &self.upload_id, part_idx, buf.into()) + .await } async fn complete(&self, completed_parts: Vec) -> Result<()> { @@ -268,6 +259,36 @@ impl PutPart for S3MultiPartUpload { } } +#[async_trait] +impl MultiPartStore for AmazonS3 { + async fn create_multipart(&self, path: &Path) -> Result { + self.client.create_multipart(path).await + } + + async fn put_part( + &self, + path: &Path, + id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + self.client.put_part(path, 
id, part_idx, data).await + } + + async fn complete_multipart( + &self, + path: &Path, + id: &MultipartId, + parts: Vec, + ) -> Result { + self.client.complete_multipart(path, id, parts).await + } + + async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()> { + self.client.delete_request(path, &[("uploadId", id)]).await + } +} + #[cfg(test)] mod tests { use super::*; @@ -293,6 +314,8 @@ mod tests { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; stream_get(&integration).await; + multipart(&integration, &integration).await; + if test_not_exists { copy_if_not_exists(&integration).await; } diff --git a/src/azure/client.rs b/src/azure/client.rs index cd3df8c..9f47b9a 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -19,13 +19,16 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; use crate::client::get::GetClient; -use crate::client::header::HeaderConfig; +use crate::client::header::{get_etag, HeaderConfig}; use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; +use crate::multipart::PartId; use crate::path::DELIMITER; use crate::util::deserialize_rfc1123; -use crate::{ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, RetryConfig}; +use crate::{ + ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutResult, Result, RetryConfig, +}; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -84,6 +87,11 @@ pub(crate) enum Error { Authorization { source: crate::azure::credential::Error, }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, } impl From for crate::Error { @@ -190,6 +198,43 @@ impl AzureClient { Ok(response) } + /// PUT a block + pub async fn put_block(&self, path: &Path, part_idx: usize, data: Bytes) -> Result { + let content_id = format!("{part_idx:20}"); + let block_id: BlockId = content_id.clone().into(); + + self.put_request( + path, + Some(data), + true, + &[ + ("comp", "block"), + ("blockid", &BASE64_STANDARD.encode(block_id)), + ], + ) + .await?; + + Ok(PartId { content_id }) + } + + /// PUT a block list + pub async fn put_block_list(&self, path: &Path, parts: Vec) -> Result { + let blocks = parts + .into_iter() + .map(|part| BlockId::from(part.content_id)) + .collect(); + + let block_list = BlockList { blocks }; + let block_xml = block_list.to_xml(); + + let response = self + .put_request(path, Some(block_xml.into()), true, &[("comp", "blocklist")]) + .await?; + + let e_tag = get_etag(response.headers()).context(MetadataSnafu)?; + Ok(PutResult { e_tag: Some(e_tag) }) + } + /// Make an Azure Delete request pub async fn delete_request( &self, diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 5f76875..779ac2f 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -26,15 +26,12 @@ //! [ObjectStore::abort_multipart] is a no-op, since Azure Blob Store doesn't provide //! a way to drop old blocks. Instead unused blocks are automatically cleaned up //! after 7 days. 
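// An illustrative sketch of driving the new MultiPartStore support from the caller's side,
// using the Azure semantics described above. The store handle, object path and chunk sizes
// are assumptions, and error handling is minimal. On Azure, create_multipart returns an
// empty MultipartId because staged blocks need no upload session, and abort_multipart is a
// no-op: unreferenced blocks are simply expired by the service after 7 days.
async fn azure_block_upload(
    store: &dyn object_store::multipart::MultiPartStore,
) -> object_store::Result<()> {
    use bytes::Bytes;
    use object_store::path::Path;

    let path = Path::from("data/blob.bin");
    let id = store.create_multipart(&path).await?; // "" for Azure
    // Each part is staged as a block (PUT ?comp=block)
    let p0 = store.put_part(&path, &id, 0, Bytes::from(vec![0u8; 5 * 1024 * 1024])).await?;
    let p1 = store.put_part(&path, &id, 1, Bytes::from_static(b"trailing bytes")).await?;
    // Committing the block list (PUT ?comp=blocklist) makes the object visible
    store.complete_multipart(&path, &id, vec![p0, p1]).await?;
    Ok(())
}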
-use self::client::{BlockId, BlockList}; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, }; use async_trait::async_trait; -use base64::prelude::BASE64_STANDARD; -use base64::Engine; use bytes::Bytes; use futures::stream::BoxStream; use std::fmt::Debug; @@ -53,6 +50,7 @@ mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] pub type AzureCredentialProvider = Arc>; use crate::client::header::get_etag; +use crate::multipart::MultiPartStore; pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; @@ -151,43 +149,44 @@ struct AzureMultiPartUpload { #[async_trait] impl PutPart for AzureMultiPartUpload { - async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - let content_id = format!("{part_idx:20}"); - let block_id: BlockId = content_id.clone().into(); - - self.client - .put_request( - &self.location, - Some(buf.into()), - true, - &[ - ("comp", "block"), - ("blockid", &BASE64_STANDARD.encode(block_id)), - ], - ) - .await?; + async fn put_part(&self, buf: Vec, idx: usize) -> Result { + self.client.put_block(&self.location, idx, buf.into()).await + } - Ok(PartId { content_id }) + async fn complete(&self, parts: Vec) -> Result<()> { + self.client.put_block_list(&self.location, parts).await?; + Ok(()) } +} - async fn complete(&self, completed_parts: Vec) -> Result<()> { - let blocks = completed_parts - .into_iter() - .map(|part| BlockId::from(part.content_id)) - .collect(); +#[async_trait] +impl MultiPartStore for MicrosoftAzure { + async fn create_multipart(&self, _: &Path) -> Result { + Ok(String::new()) + } - let block_list = BlockList { blocks }; - let block_xml = block_list.to_xml(); + async fn put_part( + &self, + path: &Path, + _: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + self.client.put_block(path, part_idx, data).await + } - self.client - .put_request( - &self.location, - Some(block_xml.into()), - true, - &[("comp", "blocklist")], - ) - .await?; + async fn complete_multipart( + &self, + path: &Path, + _: &MultipartId, + parts: Vec, + ) -> Result { + self.client.put_block_list(path, parts).await + } + async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> Result<()> { + // There is no way to drop blocks that have been uploaded. Instead, they simply + // expire in 7 days. 
Ok(()) } } @@ -195,10 +194,7 @@ impl PutPart for AzureMultiPartUpload { #[cfg(test)] mod tests { use super::*; - use crate::tests::{ - copy_if_not_exists, get_opts, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list_opts, rename_and_copy, stream_get, - }; + use crate::tests::*; #[tokio::test] async fn azure_blob_test() { @@ -212,6 +208,7 @@ mod tests { rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; stream_get(&integration).await; + multipart(&integration, &integration).await; } #[test] diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 558a6f8..8c44f90 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -24,7 +24,7 @@ use crate::client::GetOptionsExt; use crate::gcp::{GcpCredential, GcpCredentialProvider, STORE}; use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; -use crate::{ClientOptions, GetOptions, ListResult, MultipartId, Result, RetryConfig}; +use crate::{ClientOptions, GetOptions, ListResult, MultipartId, PutResult, Result, RetryConfig}; use async_trait::async_trait; use bytes::{Buf, Bytes}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; @@ -184,6 +184,30 @@ impl GoogleCloudStorageClient { Ok(get_etag(response.headers()).context(MetadataSnafu)?) } + /// Perform a put part request + /// + /// Returns the new [`PartId`] + pub async fn put_part( + &self, + path: &Path, + upload_id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + let content_id = self + .put_request( + path, + data, + &[ + ("partNumber", &format!("{}", part_idx + 1)), + ("uploadId", upload_id), + ], + ) + .await?; + + Ok(PartId { content_id }) + } + /// Initiate a multi-part upload pub async fn multipart_initiate(&self, path: &Path) -> Result { let credential = self.get_credential().await?; @@ -240,7 +264,7 @@ impl GoogleCloudStorageClient { path: &Path, multipart_id: &MultipartId, completed_parts: Vec, - ) -> Result<()> { + ) -> Result { let upload_id = multipart_id.clone(); let url = self.object_url(path); @@ -263,7 +287,8 @@ impl GoogleCloudStorageClient { // https://github.com/tafia/quick-xml/issues/350 .replace(""", "\""); - self.client + let result = self + .client .request(Method::POST, &url) .bearer_auth(&credential.bearer) .query(&[("uploadId", upload_id)]) @@ -274,7 +299,8 @@ impl GoogleCloudStorageClient { path: path.as_ref(), })?; - Ok(()) + let etag = get_etag(result.headers()).context(MetadataSnafu)?; + Ok(PutResult { e_tag: Some(etag) }) } /// Perform a delete request diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 6512a8b..0eb3e9c 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -45,6 +45,7 @@ use tokio::io::AsyncWrite; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; +use crate::multipart::MultiPartStore; pub use builder::{GoogleCloudStorageBuilder, GoogleConfigKey}; pub use credential::GcpCredential; @@ -90,27 +91,17 @@ struct GCSMultipartUpload { impl PutPart for GCSMultipartUpload { /// Upload an object part async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - let upload_id = self.multipart_id.clone(); - let content_id = self - .client - .put_request( - &self.path, - buf.into(), - &[ - ("partNumber", format!("{}", part_idx + 1)), - ("uploadId", upload_id), - ], - ) - .await?; - - Ok(PartId { content_id }) + self.client + .put_part(&self.path, &self.multipart_id, part_idx, buf.into()) + .await } /// Complete a multipart upload async fn complete(&self, completed_parts: Vec) -> Result<()> { self.client 
.multipart_complete(&self.path, &self.multipart_id, completed_parts) - .await + .await?; + Ok(()) } } @@ -169,6 +160,36 @@ impl ObjectStore for GoogleCloudStorage { } } +#[async_trait] +impl MultiPartStore for GoogleCloudStorage { + async fn create_multipart(&self, path: &Path) -> Result { + self.client.multipart_initiate(path).await + } + + async fn put_part( + &self, + path: &Path, + id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + self.client.put_part(path, id, part_idx, data).await + } + + async fn complete_multipart( + &self, + path: &Path, + id: &MultipartId, + parts: Vec, + ) -> Result { + self.client.multipart_complete(path, id, parts).await + } + + async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()> { + self.client.multipart_cleanup(path, id).await + } +} + #[cfg(test)] mod test { @@ -197,6 +218,7 @@ mod test { // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; + multipart(&integration, &integration).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; } diff --git a/src/lib.rs b/src/lib.rs index 656b303..9a06672 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -263,7 +263,6 @@ pub use client::{ #[cfg(feature = "cloud")] mod config; -#[cfg(feature = "cloud")] pub mod multipart; mod parse; mod util; @@ -302,18 +301,29 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// should be able to observe a partially written object async fn put(&self, location: &Path, bytes: Bytes) -> Result; - /// Get a multi-part upload that allows writing data in chunks + /// Get a multi-part upload that allows writing data in chunks. /// /// Most cloud-based uploads will buffer and upload parts in parallel. /// /// To complete the upload, [AsyncWrite::poll_shutdown] must be called /// to completion. This operation is guaranteed to be atomic, it will either /// make all the written data available at `location`, or fail. No clients - /// should be able to observe a partially written object + /// should be able to observe a partially written object. /// /// For some object stores (S3, GCS, and local in particular), if the /// writer fails or panics, you must call [ObjectStore::abort_multipart] /// to clean up partially written data. + /// + /// For applications requiring fine-grained control of multipart uploads + /// see [`MultiPartStore`], although note that this interface cannot be + /// supported by all [`ObjectStore`] backends. + /// + /// For applications looking to implement this interface for a custom + /// multipart API, see [`WriteMultiPart`] which handles the complexities + /// of performing parallel uploads of fixed size parts. 
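// A minimal sketch of the buffered path documented above: put_multipart returns an
// AsyncWrite that uploads fixed-size parts in parallel, and the object only becomes
// visible once shutdown() completes. The store handle, location and error handling are
// assumptions for illustration; on failure, abort_multipart cleans up staged parts.
async fn buffered_upload(
    store: &dyn object_store::ObjectStore,
    data: &[u8],
) -> Result<(), Box<dyn std::error::Error>> {
    use object_store::path::Path;
    use tokio::io::AsyncWriteExt;

    let location = Path::from("nested/large.bin");
    let (multipart_id, mut writer) = store.put_multipart(&location).await?;
    let write_result = async {
        writer.write_all(data).await?;
        writer.shutdown().await // finalizes the upload atomically
    }
    .await;
    if write_result.is_err() {
        // Clean up any parts already uploaded before surfacing the error
        store.abort_multipart(&location, &multipart_id).await?;
    }
    Ok(write_result?)
}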
+ /// + /// [`WriteMultiPart`]: multipart::WriteMultiPart + /// [`MultiPartStore`]: multipart::MultiPartStore async fn put_multipart( &self, location: &Path, @@ -934,6 +944,7 @@ mod test_util { #[cfg(test)] mod tests { use super::*; + use crate::multipart::MultiPartStore; use crate::test_util::flatten_list_stream; use chrono::TimeZone; use rand::{thread_rng, Rng}; @@ -1681,6 +1692,31 @@ mod tests { storage.delete(&path2).await.unwrap(); } + pub(crate) async fn multipart(storage: &dyn ObjectStore, multipart: &dyn MultiPartStore) { + let path = Path::from("test_multipart"); + let chunk_size = 5 * 1024 * 1024; + + let chunks = get_chunks(chunk_size, 2); + + let id = multipart.create_multipart(&path).await.unwrap(); + + let parts: Vec<_> = futures::stream::iter(chunks) + .enumerate() + .map(|(idx, b)| multipart.put_part(&path, &id, idx, b)) + .buffered(2) + .try_collect() + .await + .unwrap(); + + multipart + .complete_multipart(&path, &id, parts) + .await + .unwrap(); + + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.size, chunk_size * 2); + } + async fn delete_fixtures(storage: &DynObjectStore) { let paths = storage.list(None).map_ok(|meta| meta.location).boxed(); storage diff --git a/src/multipart.rs b/src/multipart.rs index d4c911f..1dcd5a6 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -22,17 +22,18 @@ //! especially useful when dealing with large files or high-throughput systems. use async_trait::async_trait; +use bytes::Bytes; use futures::{stream::FuturesUnordered, Future, StreamExt}; use std::{io, pin::Pin, sync::Arc, task::Poll}; use tokio::io::AsyncWrite; -use crate::Result; +use crate::path::Path; +use crate::{MultipartId, PutResult, Result}; type BoxedTryFuture = Pin> + Send>>; -/// A trait that can be implemented by cloud-based object stores -/// and used in combination with [`WriteMultiPart`] to provide -/// multipart upload support +/// A trait used in combination with [`WriteMultiPart`] to implement +/// [`AsyncWrite`] on top of an API for multipart upload #[async_trait] pub trait PutPart: Send + Sync + 'static { /// Upload a single part @@ -52,6 +53,9 @@ pub struct PartId { } /// Wrapper around a [`PutPart`] that implements [`AsyncWrite`] +/// +/// Data will be uploaded in fixed size chunks of 10 MiB in parallel, +/// up to the configured maximum concurrency pub struct WriteMultiPart { inner: Arc, /// A list of completed parts, in sequential order. @@ -263,3 +267,52 @@ impl std::fmt::Debug for WriteMultiPart { .finish() } } + +/// A low-level interface for interacting with multipart upload APIs +/// +/// Most use-cases should prefer [`ObjectStore::put_multipart`] as this is supported by more +/// backends, including [`LocalFileSystem`], and automatically handles uploading fixed +/// size parts of sufficient size in parallel +/// +/// [`ObjectStore::put_multipart`]: crate::ObjectStore::put_multipart +/// [`LocalFileSystem`]: crate::local::LocalFileSystem +#[async_trait] +pub trait MultiPartStore: Send + Sync + 'static { + /// Creates a new multipart upload, returning the [`MultipartId`] + async fn create_multipart(&self, path: &Path) -> Result; + + /// Uploads a new part with index `part_idx` + /// + /// `part_idx` should be an integer in the range `0..N` where `N` is the number of + /// parts in the upload. Parts may be uploaded concurrently and in any order. + /// + /// Most stores require that all parts excluding the last are at least 5 MiB, and some + /// further require that all parts excluding the last be the same size, e.g. [R2]. 
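An illustrative sketch (not part of the patch) of driving the new `MultiPartStore` trait directly, along the lines of the `multipart` test above, for applications that want to control part boundaries themselves:

```rust
// Hedged sketch: using the low-level MultiPartStore interface directly.
use bytes::Bytes;
use object_store::multipart::MultiPartStore;
use object_store::path::Path;

async fn upload_two_parts(
    store: &dyn MultiPartStore,
    part_a: Bytes, // all parts except the last should be >= 5 MiB on most stores
    part_b: Bytes,
) -> object_store::Result<()> {
    let path = Path::from("multipart/example");
    let id = store.create_multipart(&path).await?;

    // Parts may be uploaded concurrently and in any order, keyed by part_idx
    let p0 = store.put_part(&path, &id, 0, part_a).await?;
    let p1 = store.put_part(&path, &id, 1, part_b).await?;

    // The PartIds must be passed back in part_idx order
    store.complete_multipart(&path, &id, vec![p0, p1]).await?;
    Ok(())
}
```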
+ /// [`WriteMultiPart`] performs writes in fixed size blocks of 10 MiB, and clients wanting + /// to maximise compatibility should look to do likewise. + /// + /// [R2]: https://developers.cloudflare.com/r2/objects/multipart-objects/#limitations + async fn put_part( + &self, + path: &Path, + id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result; + + /// Completes a multipart upload + /// + /// The `i`'th value of `parts` must be a [`PartId`] returned by a call to [`Self::put_part`] + /// with a `part_idx` of `i`, and the same `path` and `id` as provided to this method. Calling + /// this method with out of sequence or repeated [`PartId`], or [`PartId`] returned for other + /// values of `path` or `id`, will result in implementation-defined behaviour + async fn complete_multipart( + &self, + path: &Path, + id: &MultipartId, + parts: Vec, + ) -> Result; + + /// Aborts a multipart upload + async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()>; +} diff --git a/src/signer.rs b/src/signer.rs index f792397..ed92e28 100644 --- a/src/signer.rs +++ b/src/signer.rs @@ -23,8 +23,7 @@ use reqwest::Method; use std::{fmt, time::Duration}; use url::Url; -/// Universal API to presigned URLs generated from multiple object store services. Not supported by -/// all object store services. +/// Universal API to generate presigned URLs from multiple object store services. #[async_trait] pub trait Signer: Send + Sync + fmt::Debug + 'static { /// Given the intended [`Method`] and [`Path`] to use and the desired length of time for which From 9ec128cb140941e1aeb31d95013aabddaee5da79 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 26 Oct 2023 09:16:48 +0100 Subject: [PATCH 219/397] Support bucket name with `.` when parsing GCS URL (#4991) (#4992) * Support bucket name with `.` when parsing GCS URL (#4991) * Update test --- src/gcp/builder.rs | 18 ++++++------------ src/parse.rs | 4 ++++ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 2039d23..5f718d6 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -337,13 +337,8 @@ impl GoogleCloudStorageBuilder { let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), - false => Ok(s.to_string()), - }; - match parsed.scheme() { - "gs" => self.bucket_name = Some(validate(host)?), + "gs" => self.bucket_name = Some(host.to_string()), scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), } Ok(()) @@ -630,13 +625,12 @@ mod tests { fn gcs_test_urls() { let mut builder = GoogleCloudStorageBuilder::new(); builder.parse_url("gs://bucket/path").unwrap(); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); + assert_eq!(builder.bucket_name.as_deref(), Some("bucket")); - let err_cases = ["mailto://bucket/path", "gs://bucket.mydomain/path"]; - let mut builder = GoogleCloudStorageBuilder::new(); - for case in err_cases { - builder.parse_url(case).unwrap_err(); - } + builder.parse_url("gs://bucket.mydomain/path").unwrap(); + assert_eq!(builder.bucket_name.as_deref(), Some("bucket.mydomain")); + + builder.parse_url("mailto://bucket/path").unwrap_err(); } #[test] diff --git a/src/parse.rs b/src/parse.rs index 170726f..51993e2 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -234,6 +234,10 @@ mod tests { 
"gs://bucket/path", (ObjectStoreScheme::GoogleCloudStorage, "path"), ), + ( + "gs://test.example.com/path", + (ObjectStoreScheme::GoogleCloudStorage, "path"), + ), ("http://mydomain/path", (ObjectStoreScheme::Http, "path")), ("https://mydomain/path", (ObjectStoreScheme::Http, "path")), ]; From 356a8b333a42473059c86219545950b2d9fcb664 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 26 Oct 2023 09:17:23 +0100 Subject: [PATCH 220/397] Don't panic on invalid Azure access key (#4972) (#4974) --- src/azure/builder.rs | 14 ++++++++------ src/azure/credential.rs | 23 +++++++++++++++++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 915e4c5..02e0762 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -17,7 +17,7 @@ use crate::azure::client::{AzureClient, AzureConfig}; use crate::azure::credential::{ - AzureCliCredential, ClientSecretOAuthProvider, ImdsManagedIdentityProvider, + AzureAccessKey, AzureCliCredential, ClientSecretOAuthProvider, ImdsManagedIdentityProvider, WorkloadIdentityOAuthProvider, }; use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; @@ -800,11 +800,12 @@ impl MicrosoftAzureBuilder { // Allow overriding defaults. Values taken from // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let account_key = self - .access_key - .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); + let key = match self.access_key { + Some(k) => AzureAccessKey::try_new(&k)?, + None => AzureAccessKey::try_new(EMULATOR_ACCOUNT_KEY)?, + }; - let credential = static_creds(AzureCredential::AccessKey(account_key)); + let credential = static_creds(AzureCredential::AccessKey(key)); self.client_options = self.client_options.with_allow_http(true); (true, url, credential, account_name) @@ -828,7 +829,8 @@ impl MicrosoftAzureBuilder { } else if let Some(bearer_token) = self.bearer_token { static_creds(AzureCredential::BearerToken(bearer_token)) } else if let Some(access_key) = self.access_key { - static_creds(AzureCredential::AccessKey(access_key)) + let key = AzureAccessKey::try_new(&access_key)?; + static_creds(AzureCredential::AccessKey(key)) } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = (&self.client_id, &self.tenant_id, self.federated_token_file) { diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 283d7ff..2b8788d 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -75,6 +75,9 @@ pub enum Error { #[snafu(display("Error reading federated token file "))] FederatedTokenFile, + #[snafu(display("Invalid Access Key: {}", source))] + InvalidAccessKey { source: base64::DecodeError }, + #[snafu(display("'az account get-access-token' command failed: {message}"))] AzureCli { message: String }, @@ -93,13 +96,25 @@ impl From for crate::Error { } } +/// A shared Azure Storage Account Key +#[derive(Debug, Eq, PartialEq)] +pub struct AzureAccessKey(Vec); + +impl AzureAccessKey { + /// Create a new [`AzureAccessKey`], checking it for validity + pub fn try_new(key: &str) -> Result { + let key = BASE64_STANDARD.decode(key).context(InvalidAccessKeySnafu)?; + Ok(Self(key)) + } +} + /// An Azure storage credential #[derive(Debug, Eq, PartialEq)] pub enum AzureCredential { /// A shared access key /// /// - AccessKey(String), + 
AccessKey(AzureAccessKey), /// A shared access signature /// /// @@ -149,7 +164,7 @@ impl CredentialExt for RequestBuilder { request.url(), request.method(), account, - key.as_str(), + key, ); // "signature" is a base 64 encoded string so it should never @@ -174,10 +189,10 @@ fn generate_authorization( u: &Url, method: &Method, account: &str, - key: &str, + key: &AzureAccessKey, ) -> String { let str_to_sign = string_to_sign(h, u, method, account); - let auth = hmac_sha256(BASE64_STANDARD.decode(key).unwrap(), str_to_sign); + let auth = hmac_sha256(&key.0, str_to_sign); format!("SharedKey {}:{}", account, BASE64_STANDARD.encode(auth)) } From b45b8381aacb60283228f04467e2686dddcb485f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 27 Oct 2023 11:21:03 +0100 Subject: [PATCH 221/397] Conditional Put (#4879) (#4984) * Add version to PutResult * Conditional Put (#4879) * Don't support HttpStore * Add R2 Support * Update Azure StatusCode * Fixes * Clippy * Clippy * PutRequestBuilder * Clippy * Add stress test * Clippy --- src/aws/builder.rs | 30 +++- src/aws/client.rs | 177 +++++++++++----------- src/aws/mod.rs | 43 +++++- src/aws/{copy.rs => precondition.rs} | 45 +++++- src/azure/client.rs | 139 +++++++++-------- src/azure/mod.rs | 17 +-- src/chunked.rs | 7 +- src/client/header.rs | 17 +++ src/client/mod.rs | 2 +- src/client/retry.rs | 4 + src/client/{list_response.rs => s3.rs} | 46 +++++- src/gcp/client.rs | 197 +++++++++++++++---------- src/gcp/mod.rs | 9 +- src/http/client.rs | 4 + src/http/mod.rs | 15 +- src/lib.rs | 171 ++++++++++++++++++++- src/limit.rs | 6 +- src/local.rs | 48 ++++-- src/memory.rs | 67 ++++++++- src/prefix.rs | 8 +- src/throttle.rs | 9 +- tests/get_range_file.rs | 32 ++-- 22 files changed, 791 insertions(+), 302 deletions(-) rename src/aws/{copy.rs => precondition.rs} (68%) rename src/client/{list_response.rs => s3.rs} (68%) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 75a5299..79ea75b 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -20,7 +20,8 @@ use crate::aws::credential::{ InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, }; use crate::aws::{ - AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3CopyIfNotExists, STORE, + AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3ConditionalPut, S3CopyIfNotExists, + STORE, }; use crate::client::TokenCredentialProvider; use crate::config::ConfigValue; @@ -152,6 +153,8 @@ pub struct AmazonS3Builder { skip_signature: ConfigValue, /// Copy if not exists copy_if_not_exists: Option>, + /// Put precondition + conditional_put: Option>, } /// Configuration keys for [`AmazonS3Builder`] @@ -288,6 +291,11 @@ pub enum AmazonS3ConfigKey { /// See [`S3CopyIfNotExists`] CopyIfNotExists, + /// Configure how to provide conditional put operations + /// + /// See [`S3ConditionalPut`] + ConditionalPut, + /// Skip signing request SkipSignature, @@ -312,7 +320,8 @@ impl AsRef for AmazonS3ConfigKey { Self::Checksum => "aws_checksum_algorithm", Self::ContainerCredentialsRelativeUri => "aws_container_credentials_relative_uri", Self::SkipSignature => "aws_skip_signature", - Self::CopyIfNotExists => "copy_if_not_exists", + Self::CopyIfNotExists => "aws_copy_if_not_exists", + Self::ConditionalPut => "aws_conditional_put", Self::Client(opt) => opt.as_ref(), } } @@ -339,7 +348,8 @@ impl FromStr for AmazonS3ConfigKey { "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), "aws_container_credentials_relative_uri" => 
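To illustrate the effect of the key-validation change above (a hedged sketch, not part of the patch; the builder methods shown are the existing `MicrosoftAzureBuilder` API): a malformed account key is now reported as an error when the store is built, rather than panicking when the first request is signed:

```rust
// Hedged sketch: an access key that is not valid base64 fails in build().
use object_store::azure::MicrosoftAzureBuilder;

fn invalid_key_is_rejected() {
    // '-' and '!' are not in the standard base64 alphabet, so AzureAccessKey::try_new fails
    let result = MicrosoftAzureBuilder::new()
        .with_account("myaccount")
        .with_container_name("mycontainer")
        .with_access_key("not-a-base64-key!")
        .build();
    assert!(result.is_err());
}
```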
Ok(Self::ContainerCredentialsRelativeUri), "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), - "copy_if_not_exists" => Ok(Self::CopyIfNotExists), + "aws_copy_if_not_exists" | "copy_if_not_exists" => Ok(Self::CopyIfNotExists), + "aws_conditional_put" | "conditional_put" => Ok(Self::ConditionalPut), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -446,6 +456,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::ConditionalPut => { + self.conditional_put = Some(ConfigValue::Deferred(value.into())) + } }; self } @@ -509,6 +522,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists.as_ref().map(ToString::to_string) } + AmazonS3ConfigKey::ConditionalPut => { + self.conditional_put.as_ref().map(ToString::to_string) + } } } @@ -713,6 +729,12 @@ impl AmazonS3Builder { self } + /// Configure how to provide conditional put operations + pub fn with_conditional_put(mut self, config: S3ConditionalPut) -> Self { + self.conditional_put = Some(config.into()); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(mut self) -> Result { @@ -724,6 +746,7 @@ impl AmazonS3Builder { let region = self.region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; + let put_precondition = self.conditional_put.map(|x| x.get()).transpose()?; let credentials = if let Some(credentials) = self.credentials { credentials @@ -830,6 +853,7 @@ impl AmazonS3Builder { skip_signature: self.skip_signature.get()?, checksum, copy_if_not_exists, + conditional_put: put_precondition, }; let client = Arc::new(S3Client::new(config)?); diff --git a/src/aws/client.rs b/src/aws/client.rs index 4e98f25..20c2a96 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -17,13 +17,18 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; -use crate::aws::{AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET}; +use crate::aws::{ + AwsCredentialProvider, S3ConditionalPut, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, +}; use crate::client::get::GetClient; -use crate::client::header::get_etag; use crate::client::header::HeaderConfig; +use crate::client::header::{get_put_result, get_version}; use crate::client::list::ListClient; -use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; +use crate::client::s3::{ + CompleteMultipartUpload, CompleteMultipartUploadResult, InitiateMultipartUploadResult, + ListResponse, +}; use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; @@ -34,17 +39,20 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; +use hyper::http::HeaderName; use itertools::Itertools; use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; use reqwest::{ header::{CONTENT_LENGTH, CONTENT_TYPE}, - Client as ReqwestClient, Method, Response, StatusCode, + Client as ReqwestClient, Method, RequestBuilder, Response, StatusCode, }; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::sync::Arc; +const VERSION_HEADER: &str = "x-amz-version-id"; + /// A specialized `Error` for 
object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -147,33 +155,6 @@ impl From for crate::Error { } } -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -struct InitiateMultipart { - upload_id: String, -} - -#[derive(Debug, Serialize)] -#[serde(rename_all = "PascalCase", rename = "CompleteMultipartUpload")] -struct CompleteMultipart { - part: Vec, -} - -#[derive(Debug, Serialize)] -struct MultipartPart { - #[serde(rename = "ETag")] - e_tag: String, - #[serde(rename = "PartNumber")] - part_number: usize, -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase", rename = "CompleteMultipartUploadResult")] -struct CompleteMultipartResult { - #[serde(rename = "ETag")] - e_tag: String, -} - #[derive(Deserialize)] #[serde(rename_all = "PascalCase", rename = "DeleteResult")] struct BatchDeleteResponse { @@ -225,12 +206,61 @@ pub struct S3Config { pub skip_signature: bool, pub checksum: Option, pub copy_if_not_exists: Option, + pub conditional_put: Option, } impl S3Config { pub(crate) fn path_url(&self, path: &Path) -> String { format!("{}/{}", self.bucket_endpoint, encode_path(path)) } + + async fn get_credential(&self) -> Result>> { + Ok(match self.skip_signature { + false => Some(self.credentials.get_credential().await?), + true => None, + }) + } +} + +/// A builder for a put request allowing customisation of the headers and query string +pub(crate) struct PutRequest<'a> { + path: &'a Path, + config: &'a S3Config, + builder: RequestBuilder, + payload_sha256: Option>, +} + +impl<'a> PutRequest<'a> { + pub fn query(self, query: &T) -> Self { + let builder = self.builder.query(query); + Self { builder, ..self } + } + + pub fn header(self, k: &HeaderName, v: &str) -> Self { + let builder = self.builder.header(k, v); + Self { builder, ..self } + } + + pub async fn send(self) -> Result { + let credential = self.config.get_credential().await?; + + let response = self + .builder + .with_aws_sigv4( + credential.as_deref(), + &self.config.region, + "s3", + self.config.sign_payload, + self.payload_sha256.as_deref(), + ) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: self.path.as_ref(), + })?; + + Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + } } #[derive(Debug)] @@ -250,23 +280,10 @@ impl S3Client { &self.config } - async fn get_credential(&self) -> Result>> { - Ok(match self.config.skip_signature { - false => Some(self.config.credentials.get_credential().await?), - true => None, - }) - } - /// Make an S3 PUT request /// /// Returns the ETag - pub async fn put_request( - &self, - path: &Path, - bytes: Bytes, - query: &T, - ) -> Result { - let credential = self.get_credential().await?; + pub fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> PutRequest<'a> { let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); let mut payload_sha256 = None; @@ -288,22 +305,12 @@ impl S3Client { builder = builder.header(CONTENT_TYPE, value); } - let response = builder - .query(query) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - payload_sha256.as_deref(), - ) - .send_retry(&self.config.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; - - Ok(get_etag(response.headers()).context(MetadataSnafu)?) 
+ PutRequest { + path, + builder, + payload_sha256, + config: &self.config, + } } /// Make an S3 Delete request @@ -312,7 +319,7 @@ impl S3Client { path: &Path, query: &T, ) -> Result<()> { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.path_url(path); self.client @@ -346,7 +353,7 @@ impl S3Client { return Ok(Vec::new()); } - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = format!("{}?delete", self.config.bucket_endpoint); let mut buffer = Vec::new(); @@ -444,7 +451,7 @@ impl S3Client { /// Make an S3 Copy request pub async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.path_url(to); let source = format!("{}/{}", self.config.bucket, encode_path(from)); @@ -492,7 +499,7 @@ impl S3Client { } pub async fn create_multipart(&self, location: &Path) -> Result { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = format!("{}?uploads=", self.config.path_url(location),); let response = self @@ -512,7 +519,7 @@ impl S3Client { .await .context(CreateMultipartResponseBodySnafu)?; - let response: InitiateMultipart = + let response: InitiateMultipartUploadResult = quick_xml::de::from_reader(response.reader()).context(InvalidMultipartResponseSnafu)?; Ok(response.upload_id) @@ -527,15 +534,15 @@ impl S3Client { ) -> Result { let part = (part_idx + 1).to_string(); - let content_id = self - .put_request( - path, - data, - &[("partNumber", &part), ("uploadId", upload_id)], - ) + let result = self + .put_request(path, data) + .query(&[("partNumber", &part), ("uploadId", upload_id)]) + .send() .await?; - Ok(PartId { content_id }) + Ok(PartId { + content_id: result.e_tag.unwrap(), + }) } pub async fn complete_multipart( @@ -544,19 +551,10 @@ impl S3Client { upload_id: &str, parts: Vec, ) -> Result { - let parts = parts - .into_iter() - .enumerate() - .map(|(part_idx, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_idx + 1, - }) - .collect(); - - let request = CompleteMultipart { part: parts }; + let request = CompleteMultipartUpload::from(parts); let body = quick_xml::se::to_string(&request).unwrap(); - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.path_url(location); let response = self @@ -575,16 +573,19 @@ impl S3Client { .await .context(CompleteMultipartRequestSnafu)?; + let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + let data = response .bytes() .await .context(CompleteMultipartResponseBodySnafu)?; - let response: CompleteMultipartResult = + let response: CompleteMultipartUploadResult = quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; Ok(PutResult { e_tag: Some(response.e_tag), + version, }) } } @@ -596,12 +597,12 @@ impl GetClient for S3Client { const HEADER_CONFIG: HeaderConfig = HeaderConfig { etag_required: false, last_modified_required: false, - version_header: Some("x-amz-version-id"), + version_header: Some(VERSION_HEADER), }; /// Make an S3 GET request async fn get_request(&self, path: &Path, options: GetOptions) -> Result { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.path_url(path); 
let method = match options.head { true => Method::HEAD, @@ -643,7 +644,7 @@ impl ListClient for S3Client { token: Option<&str>, offset: Option<&str>, ) -> Result<(ListResult, Option)> { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.bucket_endpoint.clone(); let mut query = Vec::with_capacity(4); diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 57254c7..99e6376 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -35,6 +35,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; +use reqwest::header::{IF_MATCH, IF_NONE_MATCH}; use reqwest::Method; use std::{sync::Arc, time::Duration}; use tokio::io::AsyncWrite; @@ -47,20 +48,20 @@ use crate::client::CredentialProvider; use crate::multipart::{MultiPartStore, PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutResult, - Result, + Error, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutMode, + PutOptions, PutResult, Result, }; mod builder; mod checksum; mod client; -mod copy; mod credential; +mod precondition; mod resolve; pub use builder::{AmazonS3Builder, AmazonS3ConfigKey}; pub use checksum::Checksum; -pub use copy::S3CopyIfNotExists; +pub use precondition::{S3ConditionalPut, S3CopyIfNotExists}; pub use resolve::resolve_bucket_region; // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html @@ -158,9 +159,33 @@ impl Signer for AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - let e_tag = self.client.put_request(location, bytes, &()).await?; - Ok(PutResult { e_tag: Some(e_tag) }) + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let request = self.client.put_request(location, bytes); + match (opts.mode, &self.client.config().conditional_put) { + (PutMode::Overwrite, _) => request.send().await, + (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), + (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { + match request.header(&IF_NONE_MATCH, "*").send().await { + // Technically If-None-Match should return NotModified but some stores, + // such as R2, instead return PreconditionFailed + // https://developers.cloudflare.com/r2/api/s3/extensions/#conditional-operations-in-putobject + Err(e @ Error::NotModified { .. } | e @ Error::Precondition { .. 
}) => { + Err(Error::AlreadyExists { + path: location.to_string(), + source: Box::new(e), + }) + } + r => r, + } + } + (PutMode::Update(v), Some(S3ConditionalPut::ETagMatch)) => { + let etag = v.e_tag.ok_or_else(|| Error::Generic { + store: STORE, + source: "ETag required for conditional put".to_string().into(), + })?; + request.header(&IF_MATCH, etag.as_str()).send().await + } + } } async fn put_multipart( @@ -306,6 +331,7 @@ mod tests { let config = integration.client.config(); let is_local = config.endpoint.starts_with("http://"); let test_not_exists = config.copy_if_not_exists.is_some(); + let test_conditional_put = config.conditional_put.is_some(); // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 put_get_delete_list_opts(&integration, is_local).await; @@ -319,6 +345,9 @@ mod tests { if test_not_exists { copy_if_not_exists(&integration).await; } + if test_conditional_put { + put_opts(&integration, true).await; + } // run integration test with unsigned payload enabled let builder = AmazonS3Builder::from_env().with_unsigned_payload(true); diff --git a/src/aws/copy.rs b/src/aws/precondition.rs similarity index 68% rename from src/aws/copy.rs rename to src/aws/precondition.rs index da4e280..a50b57f 100644 --- a/src/aws/copy.rs +++ b/src/aws/precondition.rs @@ -17,8 +17,7 @@ use crate::config::Parse; -/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for -/// [`AmazonS3`]. +/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for [`AmazonS3`]. /// /// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists /// [`AmazonS3`]: super::AmazonS3 @@ -70,3 +69,45 @@ impl Parse for S3CopyIfNotExists { }) } } + +/// Configure how to provide conditional put support for [`AmazonS3`]. 
+/// +/// [`AmazonS3`]: super::AmazonS3 +#[derive(Debug, Clone)] +#[allow(missing_copy_implementations)] +#[non_exhaustive] +pub enum S3ConditionalPut { + /// Some S3-compatible stores, such as Cloudflare R2 and minio support conditional + /// put using the standard [HTTP precondition] headers If-Match and If-None-Match + /// + /// Encoded as `etag` ignoring whitespace + /// + /// [HTTP precondition]: https://datatracker.ietf.org/doc/html/rfc9110#name-preconditions + ETagMatch, +} + +impl std::fmt::Display for S3ConditionalPut { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::ETagMatch => write!(f, "etag"), + } + } +} + +impl S3ConditionalPut { + fn from_str(s: &str) -> Option { + match s.trim() { + "etag" => Some(Self::ETagMatch), + _ => None, + } + } +} + +impl Parse for S3ConditionalPut { + fn parse(v: &str) -> crate::Result { + Self::from_str(v).ok_or_else(|| crate::Error::Generic { + store: "Config", + source: format!("Failed to parse \"{v}\" as S3PutConditional").into(), + }) + } +} diff --git a/src/azure/client.rs b/src/azure/client.rs index 9f47b9a..c7bd791 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -19,7 +19,7 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; use crate::client::get::GetClient; -use crate::client::header::{get_etag, HeaderConfig}; +use crate::client::header::{get_put_result, HeaderConfig}; use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; @@ -27,25 +27,29 @@ use crate::multipart::PartId; use crate::path::DELIMITER; use crate::util::deserialize_rfc1123; use crate::{ - ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutResult, Result, RetryConfig, + ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutMode, PutOptions, PutResult, + Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; +use hyper::http::HeaderName; use itertools::Itertools; use reqwest::header::CONTENT_TYPE; use reqwest::{ - header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH}, - Client as ReqwestClient, Method, Response, StatusCode, + header::{HeaderValue, CONTENT_LENGTH, IF_MATCH, IF_NONE_MATCH}, + Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::HashMap; use std::sync::Arc; use url::Url; +const VERSION_HEADER: &str = "x-ms-version-id"; + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -92,6 +96,9 @@ pub(crate) enum Error { Metadata { source: crate::client::header::Error, }, + + #[snafu(display("ETag required for conditional update"))] + MissingETag, } impl From for crate::Error { @@ -134,6 +141,39 @@ impl AzureConfig { } } +/// A builder for a put request allowing customisation of the headers and query string +struct PutRequest<'a> { + path: &'a Path, + config: &'a AzureConfig, + builder: RequestBuilder, +} + +impl<'a> PutRequest<'a> { + fn header(self, k: &HeaderName, v: &str) -> Self { + let builder = self.builder.header(k, v); + Self { builder, ..self } + } + + fn query(self, query: &T) -> Self { + let builder = self.builder.query(query); + Self { builder, ..self } + } + + async fn send(self) -> Result { + let credential = self.config.credentials.get_credential().await?; 
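A hedged sketch (not part of the patch) of enabling the new option on the builder; `with_conditional_put(S3ConditionalPut::ETagMatch)` is equivalent to setting the new `aws_conditional_put` config key to `"etag"`:

```rust
// Hedged sketch: opting in to conditional put on an S3-compatible store
// (e.g. MinIO or Cloudflare R2) that honours If-Match / If-None-Match.
use object_store::aws::{AmazonS3, AmazonS3Builder, S3ConditionalPut};

fn build_store() -> object_store::Result<AmazonS3> {
    AmazonS3Builder::from_env()
        .with_bucket_name("my-bucket")
        .with_conditional_put(S3ConditionalPut::ETagMatch)
        .build()
}
```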
+ let response = self + .builder + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: self.path.as_ref(), + })?; + + Ok(response) + } +} + #[derive(Debug)] pub(crate) struct AzureClient { config: AzureConfig, @@ -156,63 +196,52 @@ impl AzureClient { self.config.credentials.get_credential().await } - /// Make an Azure PUT request - pub async fn put_request( - &self, - path: &Path, - bytes: Option, - is_block_op: bool, - query: &T, - ) -> Result { - let credential = self.get_credential().await?; + fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> PutRequest<'a> { let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); - if !is_block_op { - builder = builder.header(&BLOB_TYPE, "BlockBlob").query(query); - } else { - builder = builder.query(query); - } - if let Some(value) = self.config().client_options.get_content_type(path) { builder = builder.header(CONTENT_TYPE, value); } - if let Some(bytes) = bytes { - builder = builder - .header(CONTENT_LENGTH, HeaderValue::from(bytes.len())) - .body(bytes) - } else { - builder = builder.header(CONTENT_LENGTH, HeaderValue::from_static("0")); + builder = builder + .header(CONTENT_LENGTH, HeaderValue::from(bytes.len())) + .body(bytes); + + PutRequest { + path, + builder, + config: &self.config, } + } - let response = builder - .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; + /// Make an Azure PUT request + pub async fn put_blob(&self, path: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let builder = self.put_request(path, bytes); + + let builder = match &opts.mode { + PutMode::Overwrite => builder, + PutMode::Create => builder.header(&IF_NONE_MATCH, "*"), + PutMode::Update(v) => { + let etag = v.e_tag.as_ref().context(MissingETagSnafu)?; + builder.header(&IF_MATCH, etag) + } + }; - Ok(response) + let response = builder.header(&BLOB_TYPE, "BlockBlob").send().await?; + Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) } /// PUT a block pub async fn put_block(&self, path: &Path, part_idx: usize, data: Bytes) -> Result { let content_id = format!("{part_idx:20}"); - let block_id: BlockId = content_id.clone().into(); + let block_id = BASE64_STANDARD.encode(&content_id); - self.put_request( - path, - Some(data), - true, - &[ - ("comp", "block"), - ("blockid", &BASE64_STANDARD.encode(block_id)), - ], - ) - .await?; + self.put_request(path, data) + .query(&[("comp", "block"), ("blockid", &block_id)]) + .send() + .await?; Ok(PartId { content_id }) } @@ -224,15 +253,13 @@ impl AzureClient { .map(|part| BlockId::from(part.content_id)) .collect(); - let block_list = BlockList { blocks }; - let block_xml = block_list.to_xml(); - let response = self - .put_request(path, Some(block_xml.into()), true, &[("comp", "blocklist")]) + .put_request(path, BlockList { blocks }.to_xml().into()) + .query(&[("comp", "blocklist")]) + .send() .await?; - let e_tag = get_etag(response.headers()).context(MetadataSnafu)?; - Ok(PutResult { e_tag: Some(e_tag) }) + Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) 
} /// Make an Azure Delete request @@ -284,13 +311,7 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .map_err(|err| match err.status() { - Some(StatusCode::CONFLICT) => crate::Error::AlreadyExists { - source: Box::new(err), - path: to.to_string(), - }, - _ => err.error(STORE, from.to_string()), - })?; + .map_err(|err| err.error(STORE, from.to_string()))?; Ok(()) } @@ -303,7 +324,7 @@ impl GetClient for AzureClient { const HEADER_CONFIG: HeaderConfig = HeaderConfig { etag_required: true, last_modified_required: true, - version_header: Some("x-ms-version-id"), + version_header: Some(VERSION_HEADER), }; /// Make an Azure GET request diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 779ac2f..762a51d 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -29,7 +29,8 @@ use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -49,7 +50,6 @@ mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] pub type AzureCredentialProvider = Arc>; -use crate::client::header::get_etag; use crate::multipart::MultiPartStore; pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; @@ -82,16 +82,8 @@ impl std::fmt::Display for MicrosoftAzure { #[async_trait] impl ObjectStore for MicrosoftAzure { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - let response = self - .client - .put_request(location, Some(bytes), false, &()) - .await?; - let e_tag = get_etag(response.headers()).map_err(|e| crate::Error::Generic { - store: STORE, - source: Box::new(e), - })?; - Ok(PutResult { e_tag: Some(e_tag) }) + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + self.client.put_blob(location, bytes, opts).await } async fn put_multipart( @@ -208,6 +200,7 @@ mod tests { rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; stream_get(&integration).await; + put_opts(&integration, true).await; multipart(&integration, &integration).await; } diff --git a/src/chunked.rs b/src/chunked.rs index 021f9f5..d33556f 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -29,7 +29,8 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, + GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutOptions, + PutResult, }; use crate::{MultipartId, Result}; @@ -62,8 +63,8 @@ impl Display for ChunkedStore { #[async_trait] impl ObjectStore for ChunkedStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - self.inner.put(location, bytes).await + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + self.inner.put_opts(location, bytes, opts).await } async fn put_multipart( diff --git a/src/client/header.rs b/src/client/header.rs index e674968..e85bf6b 100644 --- a/src/client/header.rs +++ b/src/client/header.rs @@ -67,6 +67,23 @@ pub enum Error { }, } +/// Extracts a PutResult from the provided [`HeaderMap`] +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub fn get_put_result(headers: &HeaderMap, version: &str) -> Result { + let e_tag = Some(get_etag(headers)?); + let version = 
get_version(headers, version)?; + Ok(crate::PutResult { e_tag, version }) +} + +/// Extracts a optional version from the provided [`HeaderMap`] +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub fn get_version(headers: &HeaderMap, version: &str) -> Result, Error> { + Ok(match headers.get(version) { + Some(x) => Some(x.to_str().context(BadHeaderSnafu)?.to_string()), + None => None, + }) +} + /// Extracts an etag from the provided [`HeaderMap`] pub fn get_etag(headers: &HeaderMap) -> Result { let e_tag = headers.get(ETAG).ok_or(Error::MissingEtag)?; diff --git a/src/client/mod.rs b/src/client/mod.rs index 77eee7f..ae092ed 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -38,7 +38,7 @@ pub mod token; pub mod header; #[cfg(any(feature = "aws", feature = "gcp"))] -pub mod list_response; +pub mod s3; use async_trait::async_trait; use std::collections::HashMap; diff --git a/src/client/retry.rs b/src/client/retry.rs index d70d6d8..789103c 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -79,6 +79,10 @@ impl Error { path, source: Box::new(self), }, + Some(StatusCode::CONFLICT) => crate::Error::AlreadyExists { + path, + source: Box::new(self), + }, _ => crate::Error::Generic { store, source: Box::new(self), diff --git a/src/client/list_response.rs b/src/client/s3.rs similarity index 68% rename from src/client/list_response.rs rename to src/client/s3.rs index 7a170c5..61237dc 100644 --- a/src/client/list_response.rs +++ b/src/client/s3.rs @@ -14,12 +14,13 @@ // specific language governing permissions and limitations // under the License. -//! The list response format used by GCP and AWS +//! The list and multipart API used by both GCS and S3 +use crate::multipart::PartId; use crate::path::Path; use crate::{ListResult, ObjectMeta, Result}; use chrono::{DateTime, Utc}; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; #[derive(Debug, Deserialize)] #[serde(rename_all = "PascalCase")] @@ -84,3 +85,44 @@ impl TryFrom for ObjectMeta { }) } } + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct InitiateMultipartUploadResult { + pub upload_id: String, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct CompleteMultipartUpload { + pub part: Vec, +} + +impl From> for CompleteMultipartUpload { + fn from(value: Vec) -> Self { + let part = value + .into_iter() + .enumerate() + .map(|(part_number, part)| MultipartPart { + e_tag: part.content_id, + part_number: part_number + 1, + }) + .collect(); + Self { part } + } +} + +#[derive(Debug, Serialize)] +pub struct MultipartPart { + #[serde(rename = "ETag")] + pub e_tag: String, + #[serde(rename = "PartNumber")] + pub part_number: usize, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct CompleteMultipartUploadResult { + #[serde(rename = "ETag")] + pub e_tag: String, +} diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 8c44f90..7896407 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -16,23 +16,34 @@ // under the License. 
use crate::client::get::GetClient; -use crate::client::header::{get_etag, HeaderConfig}; +use crate::client::header::{get_put_result, get_version, HeaderConfig}; use crate::client::list::ListClient; -use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; +use crate::client::s3::{ + CompleteMultipartUpload, CompleteMultipartUploadResult, InitiateMultipartUploadResult, + ListResponse, +}; use crate::client::GetOptionsExt; use crate::gcp::{GcpCredential, GcpCredentialProvider, STORE}; use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; -use crate::{ClientOptions, GetOptions, ListResult, MultipartId, PutResult, Result, RetryConfig}; +use crate::{ + ClientOptions, GetOptions, ListResult, MultipartId, PutMode, PutOptions, PutResult, Result, + RetryConfig, +}; use async_trait::async_trait; use bytes::{Buf, Bytes}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; -use reqwest::{header, Client, Method, Response, StatusCode}; +use reqwest::header::HeaderName; +use reqwest::{header, Client, Method, RequestBuilder, Response, StatusCode}; use serde::Serialize; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use std::sync::Arc; +const VERSION_HEADER: &str = "x-goog-generation"; + +static VERSION_MATCH: HeaderName = HeaderName::from_static("x-goog-if-generation-match"); + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Error performing list request: {}", source))] @@ -78,6 +89,18 @@ enum Error { Metadata { source: crate::client::header::Error, }, + + #[snafu(display("Version required for conditional update"))] + MissingVersion, + + #[snafu(display("Error performing complete multipart request: {}", source))] + CompleteMultipartRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting complete multipart response body: {}", source))] + CompleteMultipartResponseBody { source: reqwest::Error }, + + #[snafu(display("Got invalid multipart response: {}", source))] + InvalidMultipartResponse { source: quick_xml::de::DeError }, } impl From for crate::Error { @@ -107,6 +130,39 @@ pub struct GoogleCloudStorageConfig { pub client_options: ClientOptions, } +/// A builder for a put request allowing customisation of the headers and query string +pub struct PutRequest<'a> { + path: &'a Path, + config: &'a GoogleCloudStorageConfig, + builder: RequestBuilder, +} + +impl<'a> PutRequest<'a> { + fn header(self, k: &HeaderName, v: &str) -> Self { + let builder = self.builder.header(k, v); + Self { builder, ..self } + } + + fn query(self, query: &T) -> Self { + let builder = self.builder.query(query); + Self { builder, ..self } + } + + async fn send(self) -> Result { + let credential = self.config.credentials.get_credential().await?; + let response = self + .builder + .bearer_auth(&credential.bearer) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: self.path.as_ref(), + })?; + + Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) 
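For context, an illustrative sketch (not part of the patch): on GCS `PutMode::Create` is implemented with `x-goog-if-generation-match: 0`, and a pre-existing object surfaces as `Error::AlreadyExists`, so create-if-absent can be written against any backend supporting it as:

```rust
// Hedged sketch: create-if-absent via PutMode::Create.
use bytes::Bytes;
use object_store::{Error, ObjectStore, PutMode};
use object_store::path::Path;

async fn create_once(store: &dyn ObjectStore, path: &Path) -> object_store::Result<bool> {
    let body = Bytes::from_static(b"initial contents");
    match store.put_opts(path, body, PutMode::Create.into()).await {
        Ok(_) => Ok(true),                             // this call created the object
        Err(Error::AlreadyExists { .. }) => Ok(false), // another writer beat us to it
        Err(e) => Err(e),
    }
}
```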
+ } +} + #[derive(Debug)] pub struct GoogleCloudStorageClient { config: GoogleCloudStorageConfig, @@ -152,13 +208,7 @@ impl GoogleCloudStorageClient { /// Perform a put request /// /// Returns the new ETag - pub async fn put_request( - &self, - path: &Path, - payload: Bytes, - query: &T, - ) -> Result { - let credential = self.get_credential().await?; + pub fn put_request<'a>(&'a self, path: &'a Path, payload: Bytes) -> PutRequest<'a> { let url = self.object_url(path); let content_type = self @@ -167,21 +217,38 @@ impl GoogleCloudStorageClient { .get_content_type(path) .unwrap_or("application/octet-stream"); - let response = self + let builder = self .client .request(Method::PUT, url) - .query(query) - .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) - .body(payload) - .send_retry(&self.config.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; + .body(payload); - Ok(get_etag(response.headers()).context(MetadataSnafu)?) + PutRequest { + path, + builder, + config: &self.config, + } + } + + pub async fn put(&self, path: &Path, data: Bytes, opts: PutOptions) -> Result { + let builder = self.put_request(path, data); + + let builder = match &opts.mode { + PutMode::Overwrite => builder, + PutMode::Create => builder.header(&VERSION_MATCH, "0"), + PutMode::Update(v) => { + let etag = v.version.as_ref().context(MissingVersionSnafu)?; + builder.header(&VERSION_MATCH, etag) + } + }; + + match (opts.mode, builder.send().await) { + (PutMode::Create, Err(crate::Error::Precondition { path, source })) => { + Err(crate::Error::AlreadyExists { path, source }) + } + (_, r) => r, + } } /// Perform a put part request @@ -194,18 +261,15 @@ impl GoogleCloudStorageClient { part_idx: usize, data: Bytes, ) -> Result { - let content_id = self - .put_request( - path, - data, - &[ - ("partNumber", &format!("{}", part_idx + 1)), - ("uploadId", upload_id), - ], - ) - .await?; - - Ok(PartId { content_id }) + let query = &[ + ("partNumber", &format!("{}", part_idx + 1)), + ("uploadId", upload_id), + ]; + let result = self.put_request(path, data).query(query).send().await?; + + Ok(PartId { + content_id: result.e_tag.unwrap(), + }) } /// Initiate a multi-part upload @@ -268,17 +332,8 @@ impl GoogleCloudStorageClient { let upload_id = multipart_id.clone(); let url = self.object_url(path); - let parts = completed_parts - .into_iter() - .enumerate() - .map(|(part_number, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_number + 1, - }) - .collect(); - + let upload_info = CompleteMultipartUpload::from(completed_parts); let credential = self.get_credential().await?; - let upload_info = CompleteMultipartUpload { parts }; let data = quick_xml::se::to_string(&upload_info) .context(InvalidPutResponseSnafu)? 
@@ -287,7 +342,7 @@ impl GoogleCloudStorageClient { // https://github.com/tafia/quick-xml/issues/350 .replace(""", "\""); - let result = self + let response = self .client .request(Method::POST, &url) .bearer_auth(&credential.bearer) @@ -295,12 +350,22 @@ impl GoogleCloudStorageClient { .body(data) .send_retry(&self.config.retry_config) .await - .context(PostRequestSnafu { - path: path.as_ref(), - })?; + .context(CompleteMultipartRequestSnafu)?; - let etag = get_etag(result.headers()).context(MetadataSnafu)?; - Ok(PutResult { e_tag: Some(etag) }) + let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + + let data = response + .bytes() + .await + .context(CompleteMultipartResponseBodySnafu)?; + + let response: CompleteMultipartUploadResult = + quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + + Ok(PutResult { + e_tag: Some(response.e_tag), + version, + }) } /// Perform a delete request @@ -334,7 +399,7 @@ impl GoogleCloudStorageClient { .header("x-goog-copy-source", source); if if_not_exists { - builder = builder.header("x-goog-if-generation-match", 0); + builder = builder.header(&VERSION_MATCH, 0); } builder @@ -362,7 +427,7 @@ impl GetClient for GoogleCloudStorageClient { const HEADER_CONFIG: HeaderConfig = HeaderConfig { etag_required: true, last_modified_required: true, - version_header: Some("x-goog-generation"), + version_header: Some(VERSION_HEADER), }; /// Perform a get request @@ -375,13 +440,18 @@ impl GetClient for GoogleCloudStorageClient { false => Method::GET, }; - let mut request = self.client.request(method, url).with_get_options(options); + let mut request = self.client.request(method, url); + + if let Some(version) = &options.version { + request = request.query(&[("generation", version)]); + } if !credential.bearer.is_empty() { request = request.bearer_auth(&credential.bearer); } let response = request + .with_get_options(options) .send_retry(&self.config.retry_config) .await .context(GetRequestSnafu { @@ -444,24 +514,3 @@ impl ListClient for GoogleCloudStorageClient { Ok((response.try_into()?, token)) } } - -#[derive(serde::Deserialize, Debug)] -#[serde(rename_all = "PascalCase")] -struct InitiateMultipartUploadResult { - upload_id: String, -} - -#[derive(serde::Serialize, Debug)] -#[serde(rename_all = "PascalCase", rename(serialize = "Part"))] -struct MultipartPart { - #[serde(rename = "PartNumber")] - part_number: usize, - e_tag: String, -} - -#[derive(serde::Serialize, Debug)] -#[serde(rename_all = "PascalCase")] -struct CompleteMultipartUpload { - #[serde(rename = "Part", default)] - parts: Vec, -} diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 0eb3e9c..7721b12 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -35,7 +35,8 @@ use crate::client::CredentialProvider; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -107,9 +108,8 @@ impl PutPart for GCSMultipartUpload { #[async_trait] impl ObjectStore for GoogleCloudStorage { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - let e_tag = self.client.put_request(location, bytes, &()).await?; - Ok(PutResult { e_tag: Some(e_tag) }) + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + self.client.put(location, bytes, opts).await 
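The GET path now forwards `GetOptions::version` (the `generation` query parameter on GCS), so a previously recorded version can be read back. A hedged sketch, not part of the patch:

```rust
// Hedged sketch: fetching a specific version recorded in ObjectMeta::version.
use bytes::Bytes;
use object_store::{GetOptions, ObjectStore};
use object_store::path::Path;

async fn read_specific_version(
    store: &dyn ObjectStore,
    path: &Path,
    version: String, // e.g. the generation returned by a previous put on GCS
) -> object_store::Result<Bytes> {
    let options = GetOptions {
        version: Some(version),
        ..Default::default()
    };
    store.get_opts(path, options).await?.bytes().await
}
```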
} async fn put_multipart( @@ -221,6 +221,7 @@ mod test { multipart(&integration, &integration).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; + put_opts(&integration, true).await; } } diff --git a/src/http/client.rs b/src/http/client.rs index a7dbdfc..8700775 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -243,6 +243,10 @@ impl Client { .header("Destination", self.path_url(to).as_str()); if !overwrite { + // While the Overwrite header appears to duplicate + // the functionality of the If-Match: * header of HTTP/1.1, If-Match + // applies only to the Request-URI, and not to the Destination of a COPY + // or MOVE. builder = builder.header("Overwrite", "F"); } diff --git a/src/http/mod.rs b/src/http/mod.rs index 8f61011..cfcde27 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -46,7 +46,7 @@ use crate::http::client::Client; use crate::path::Path; use crate::{ ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, PutResult, Result, RetryConfig, + ObjectStore, PutMode, PutOptions, PutResult, Result, RetryConfig, }; mod client; @@ -96,14 +96,23 @@ impl std::fmt::Display for HttpStore { #[async_trait] impl ObjectStore for HttpStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + if opts.mode != PutMode::Overwrite { + // TODO: Add support for If header - https://datatracker.ietf.org/doc/html/rfc2518#section-9.4 + return Err(crate::Error::NotImplemented); + } + let response = self.client.put(location, bytes).await?; let e_tag = match get_etag(response.headers()) { Ok(e_tag) => Some(e_tag), Err(crate::client::header::Error::MissingEtag) => None, Err(source) => return Err(Error::Metadata { source }.into()), }; - Ok(PutResult { e_tag }) + + Ok(PutResult { + e_tag, + version: None, + }) } async fn put_multipart( diff --git a/src/lib.rs b/src/lib.rs index 9a06672..6696430 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -299,7 +299,12 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// The operation is guaranteed to be atomic, it will either successfully /// write the entirety of `bytes` to `location`, or fail. No clients /// should be able to observe a partially written object - async fn put(&self, location: &Path, bytes: Bytes) -> Result; + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + self.put_opts(location, bytes, PutOptions::default()).await + } + + /// Save the provided bytes to the specified location with the given options + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result; /// Get a multi-part upload that allows writing data in chunks. /// @@ -531,6 +536,15 @@ macro_rules! 
as_ref_impl { self.as_ref().put(location, bytes).await } + async fn put_opts( + &self, + location: &Path, + bytes: Bytes, + opts: PutOptions, + ) -> Result { + self.as_ref().put_opts(location, bytes, opts).await + } + async fn put_multipart( &self, location: &Path, @@ -837,13 +851,65 @@ impl GetResult { } } +/// Configure preconditions for the put operation +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub enum PutMode { + /// Perform an atomic write operation, overwriting any object present at the provided path + #[default] + Overwrite, + /// Perform an atomic write operation, returning [`Error::AlreadyExists`] if an + /// object already exists at the provided path + Create, + /// Perform an atomic write operation if the current version of the object matches the + /// provided [`UpdateVersion`], returning [`Error::Precondition`] otherwise + Update(UpdateVersion), +} + +/// Uniquely identifies a version of an object to update +/// +/// Stores will use differing combinations of `e_tag` and `version` to provide conditional +/// updates, and it is therefore recommended applications preserve both +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct UpdateVersion { + /// The unique identifier for the newly created object + /// + /// + pub e_tag: Option, + /// A version indicator for the newly created object + pub version: Option, +} + +impl From for UpdateVersion { + fn from(value: PutResult) -> Self { + Self { + e_tag: value.e_tag, + version: value.version, + } + } +} + +/// Options for a put request +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct PutOptions { + /// Configure the [`PutMode`] for this operation + pub mode: PutMode, +} + +impl From for PutOptions { + fn from(mode: PutMode) -> Self { + Self { mode } + } +} + /// Result for a put request #[derive(Debug, Clone, PartialEq, Eq)] pub struct PutResult { - /// The unique identifier for the object + /// The unique identifier for the newly created object /// /// pub e_tag: Option, + /// A version indicator for the newly created object + pub version: Option, } /// A specialized `Result` for object store-related errors @@ -947,6 +1013,7 @@ mod tests { use crate::multipart::MultiPartStore; use crate::test_util::flatten_list_stream; use chrono::TimeZone; + use futures::stream::FuturesUnordered; use rand::{thread_rng, Rng}; use tokio::io::AsyncWriteExt; @@ -1406,7 +1473,7 @@ mod tests { // Can retrieve previous version let get_opts = storage.get_opts(&path, options).await.unwrap(); let old = get_opts.bytes().await.unwrap(); - assert_eq!(old, b"foo".as_slice()); + assert_eq!(old, b"test".as_slice()); // Current version contains the updated data let current = storage.get(&path).await.unwrap().bytes().await.unwrap(); @@ -1414,6 +1481,104 @@ mod tests { } } + pub(crate) async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { + delete_fixtures(storage).await; + let path = Path::from("put_opts"); + let v1 = storage + .put_opts(&path, "a".into(), PutMode::Create.into()) + .await + .unwrap(); + + let err = storage + .put_opts(&path, "b".into(), PutMode::Create.into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::AlreadyExists { .. 
}), "{err}"); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"a"); + + if !supports_update { + return; + } + + let v2 = storage + .put_opts(&path, "c".into(), PutMode::Update(v1.clone().into()).into()) + .await + .unwrap(); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"c"); + + let err = storage + .put_opts(&path, "d".into(), PutMode::Update(v1.into()).into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + storage + .put_opts(&path, "e".into(), PutMode::Update(v2.clone().into()).into()) + .await + .unwrap(); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"e"); + + // Update not exists + let path = Path::from("I don't exist"); + let err = storage + .put_opts(&path, "e".into(), PutMode::Update(v2.into()).into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + const NUM_WORKERS: usize = 5; + const NUM_INCREMENTS: usize = 10; + + let path = Path::from("RACE"); + let mut futures: FuturesUnordered<_> = (0..NUM_WORKERS) + .map(|_| async { + for _ in 0..NUM_INCREMENTS { + loop { + match storage.get(&path).await { + Ok(r) => { + let mode = PutMode::Update(UpdateVersion { + e_tag: r.meta.e_tag.clone(), + version: r.meta.version.clone(), + }); + + let b = r.bytes().await.unwrap(); + let v: usize = std::str::from_utf8(&b).unwrap().parse().unwrap(); + let new = (v + 1).to_string(); + + match storage.put_opts(&path, new.into(), mode.into()).await { + Ok(_) => break, + Err(Error::Precondition { .. }) => continue, + Err(e) => return Err(e), + } + } + Err(Error::NotFound { .. }) => { + let mode = PutMode::Create; + match storage.put_opts(&path, "1".into(), mode.into()).await { + Ok(_) => break, + Err(Error::AlreadyExists { .. 
}) => continue, + Err(e) => return Err(e), + } + } + Err(e) => return Err(e), + } + } + } + Ok(()) + }) + .collect(); + + while futures.next().await.transpose().unwrap().is_some() {} + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + let v = std::str::from_utf8(&b).unwrap().parse::().unwrap(); + assert_eq!(v, NUM_WORKERS * NUM_INCREMENTS); + } + /// Returns a chunk of length `chunk_length` fn get_chunk(chunk_length: usize) -> Bytes { let mut data = vec![0_u8; chunk_length]; diff --git a/src/limit.rs b/src/limit.rs index cd01a96..39cc605 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -19,7 +19,7 @@ use crate::{ BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, PutResult, Result, StreamExt, + ObjectStore, Path, PutOptions, PutResult, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -77,6 +77,10 @@ impl ObjectStore for LimitStore { self.inner.put(location, bytes).await } + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.put_opts(location, bytes, opts).await + } async fn put_multipart( &self, location: &Path, diff --git a/src/local.rs b/src/local.rs index ce9aa46..919baf7 100644 --- a/src/local.rs +++ b/src/local.rs @@ -20,7 +20,7 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, ObjectStore, - PutResult, Result, + PutMode, PutOptions, PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -271,20 +271,44 @@ impl Config { #[async_trait] impl ObjectStore for LocalFileSystem { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + if matches!(opts.mode, PutMode::Update(_)) { + return Err(crate::Error::NotImplemented); + } + let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); - file.write_all(&bytes) - .context(UnableToCopyDataToFileSnafu) - .and_then(|_| { - std::fs::rename(&staging_path, &path).context(UnableToRenameFileSnafu) - }) - .map_err(|e| { - let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup - e - })?; + + let err = match file.write_all(&bytes) { + Ok(_) => match opts.mode { + PutMode::Overwrite => match std::fs::rename(&staging_path, &path) { + Ok(_) => None, + Err(source) => Some(Error::UnableToRenameFile { source }), + }, + PutMode::Create => match std::fs::hard_link(&staging_path, &path) { + Ok(_) => { + let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup + None + } + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => Some(Error::AlreadyExists { + path: path.to_str().unwrap().to_string(), + source, + }), + _ => Some(Error::UnableToRenameFile { source }), + }, + }, + PutMode::Update(_) => unreachable!(), + }, + Err(source) => Some(Error::UnableToCopyDataToFile { source }), + }; + + if let Some(err) = err { + let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup + return Err(err.into()); + } let metadata = file.metadata().map_err(|e| Error::Metadata { source: e.into(), @@ -293,6 +317,7 @@ impl ObjectStore for LocalFileSystem { Ok(PutResult { e_tag: Some(get_etag(&metadata)), + version: None, }) }) .await @@ -1054,6 +1079,7 @@ mod tests { 
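The hard-link based branch above is what gives `PutMode::Create` its create-if-absent semantics on the local filesystem: the staged upload is only linked into place when no file already exists at the destination. A minimal usage sketch of that mode (the temporary directory, path and payload are illustrative; a tokio runtime and the `tempfile` crate are assumed):

```rust
use object_store::{local::LocalFileSystem, path::Path, Error, ObjectStore, PutMode};
use tempfile::TempDir;

#[tokio::main]
async fn main() {
    let root = TempDir::new().unwrap();
    let store = LocalFileSystem::new_with_prefix(root.path()).unwrap();
    let path = Path::from("data/file1");

    // First create links the staged file into place atomically
    store
        .put_opts(&path, "a".into(), PutMode::Create.into())
        .await
        .unwrap();

    // A second create against the same path is rejected
    let err = store
        .put_opts(&path, "b".into(), PutMode::Create.into())
        .await
        .unwrap_err();
    assert!(matches!(err, Error::AlreadyExists { .. }));
}
```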
rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; stream_get(&integration).await; + put_opts(&integration, false).await; } #[test] diff --git a/src/memory.rs b/src/memory.rs index 8b9522e..9d79a79 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -17,7 +17,8 @@ //! An in-memory object store implementation use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutMode, + PutOptions, PutResult, Result, UpdateVersion, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -52,6 +53,9 @@ enum Error { #[snafu(display("Object already exists at that location: {path}"))] AlreadyExists { path: String }, + + #[snafu(display("ETag required for conditional update"))] + MissingETag, } impl From for super::Error { @@ -110,9 +114,50 @@ impl Storage { let etag = self.next_etag; self.next_etag += 1; let entry = Entry::new(bytes, Utc::now(), etag); - self.map.insert(location.clone(), entry); + self.overwrite(location, entry); etag } + + fn overwrite(&mut self, location: &Path, entry: Entry) { + self.map.insert(location.clone(), entry); + } + + fn create(&mut self, location: &Path, entry: Entry) -> Result<()> { + use std::collections::btree_map; + match self.map.entry(location.clone()) { + btree_map::Entry::Occupied(_) => Err(Error::AlreadyExists { + path: location.to_string(), + } + .into()), + btree_map::Entry::Vacant(v) => { + v.insert(entry); + Ok(()) + } + } + } + + fn update(&mut self, location: &Path, v: UpdateVersion, entry: Entry) -> Result<()> { + match self.map.get_mut(location) { + // Return Precondition instead of NotFound for consistency with stores + None => Err(crate::Error::Precondition { + path: location.to_string(), + source: format!("Object at location {location} not found").into(), + }), + Some(e) => { + let existing = e.e_tag.to_string(); + let expected = v.e_tag.context(MissingETagSnafu)?; + if existing == expected { + *e = entry; + Ok(()) + } else { + Err(crate::Error::Precondition { + path: location.to_string(), + source: format!("{existing} does not match {expected}").into(), + }) + } + } + } + } } impl std::fmt::Display for InMemory { @@ -123,10 +168,21 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - let etag = self.storage.write().insert(location, bytes); + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let mut storage = self.storage.write(); + let etag = storage.next_etag; + let entry = Entry::new(bytes, Utc::now(), etag); + + match opts.mode { + PutMode::Overwrite => storage.overwrite(location, entry), + PutMode::Create => storage.create(location, entry)?, + PutMode::Update(v) => storage.update(location, v, entry)?, + } + storage.next_etag += 1; + Ok(PutResult { e_tag: Some(etag.to_string()), + version: None, }) } @@ -425,7 +481,7 @@ impl AsyncWrite for InMemoryAppend { fn poll_shutdown( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { self.poll_flush(cx) } } @@ -449,6 +505,7 @@ mod tests { rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; stream_get(&integration).await; + put_opts(&integration, true).await; } #[tokio::test] diff --git a/src/prefix.rs b/src/prefix.rs index b5bff8b..6810130 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -23,7 +23,8 @@ use tokio::io::AsyncWrite; 
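The in-memory implementation above validates `PutMode::Update` by comparing the supplied `UpdateVersion` e_tag against the stored entry, which makes `InMemory` a convenient target for trying out optimistic updates. A hedged sketch (object name and contents are illustrative; a tokio runtime is assumed):

```rust
use object_store::{memory::InMemory, path::Path, Error, ObjectStore, PutMode};

#[tokio::main]
async fn main() {
    let store = InMemory::new();
    let path = Path::from("counter");

    // Seed the object, keeping the version returned by the store
    let v1 = store
        .put_opts(&path, "1".into(), PutMode::Create.into())
        .await
        .unwrap();

    // Updating against the version we just wrote succeeds
    store
        .put_opts(&path, "2".into(), PutMode::Update(v1.clone().into()).into())
        .await
        .unwrap();

    // Re-using the now stale version fails the precondition
    let err = store
        .put_opts(&path, "3".into(), PutMode::Update(v1.into()).into())
        .await
        .unwrap_err();
    assert!(matches!(err, Error::Precondition { .. }));
}
```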
use crate::path::Path; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + Result, }; #[doc(hidden)] @@ -85,6 +86,11 @@ impl ObjectStore for PrefixStore { self.inner.put(&full_path, bytes).await } + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let full_path = self.full_path(location); + self.inner.put_opts(&full_path, bytes, opts).await + } + async fn put_multipart( &self, location: &Path, diff --git a/src/throttle.rs b/src/throttle.rs index c552125..dcd2c04 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -21,7 +21,8 @@ use std::ops::Range; use std::{convert::TryInto, sync::Arc}; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutOptions, + PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -149,10 +150,14 @@ impl std::fmt::Display for ThrottledStore { impl ObjectStore for ThrottledStore { async fn put(&self, location: &Path, bytes: Bytes) -> Result { sleep(self.config().wait_put_per_call).await; - self.inner.put(location, bytes).await } + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + sleep(self.config().wait_put_per_call).await; + self.inner.put_opts(location, bytes, opts).await + } + async fn put_multipart( &self, _location: &Path, diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index 3fa1cc7..85231a5 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -22,9 +22,7 @@ use bytes::Bytes; use futures::stream::BoxStream; use object_store::local::LocalFileSystem; use object_store::path::Path; -use object_store::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, -}; +use object_store::*; use std::fmt::Formatter; use tempfile::tempdir; use tokio::io::AsyncWrite; @@ -40,50 +38,42 @@ impl std::fmt::Display for MyStore { #[async_trait] impl ObjectStore for MyStore { - async fn put(&self, path: &Path, data: Bytes) -> object_store::Result { - self.0.put(path, data).await + async fn put_opts(&self, path: &Path, data: Bytes, opts: PutOptions) -> Result { + self.0.put_opts(path, data, opts).await } async fn put_multipart( &self, _: &Path, - ) -> object_store::Result<(MultipartId, Box)> { + ) -> Result<(MultipartId, Box)> { todo!() } - async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> object_store::Result<()> { + async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> Result<()> { todo!() } - async fn get_opts( - &self, - location: &Path, - options: GetOptions, - ) -> object_store::Result { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { self.0.get_opts(location, options).await } - async fn head(&self, _: &Path) -> object_store::Result { - todo!() - } - - async fn delete(&self, _: &Path) -> object_store::Result<()> { + async fn delete(&self, _: &Path) -> Result<()> { todo!() } - fn list(&self, _: Option<&Path>) -> BoxStream<'_, object_store::Result> { + fn list(&self, _: Option<&Path>) -> BoxStream<'_, Result> { todo!() } - async fn list_with_delimiter(&self, _: Option<&Path>) -> object_store::Result { + async fn list_with_delimiter(&self, _: Option<&Path>) -> Result { todo!() } - async fn copy(&self, _: &Path, _: &Path) -> object_store::Result<()> { + async fn 
copy(&self, _: &Path, _: &Path) -> Result<()> { todo!() } - async fn copy_if_not_exists(&self, _: &Path, _: &Path) -> object_store::Result<()> { + async fn copy_if_not_exists(&self, _: &Path, _: &Path) -> Result<()> { todo!() } } From 32450f05be33a2a278f99befedc273ced88e5b2b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 30 Oct 2023 11:09:33 +0000 Subject: [PATCH 222/397] Support list_with_offset for GCS (#4993) --- src/gcp/client.rs | 6 ++++-- src/gcp/mod.rs | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 7896407..e4b0f9a 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -472,8 +472,6 @@ impl ListClient for GoogleCloudStorageClient { page_token: Option<&str>, offset: Option<&str>, ) -> Result<(ListResult, Option)> { - assert!(offset.is_none()); // Not yet supported - let credential = self.get_credential().await?; let url = format!("{}/{}", self.config.base_url, self.bucket_name_encoded); @@ -495,6 +493,10 @@ impl ListClient for GoogleCloudStorageClient { query.push(("max-keys", max_results)) } + if let Some(offset) = offset { + query.push(("start-after", offset)) + } + let response = self .client .request(Method::GET, url) diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 7721b12..11fa683 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -147,6 +147,14 @@ impl ObjectStore for GoogleCloudStorage { self.client.list(prefix) } + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'_, Result> { + self.client.list_with_offset(prefix, offset) + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.client.list_with_delimiter(prefix).await } From 1f3ac189a769176b354af07118d0cb0fd655cf6a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 30 Oct 2023 11:30:35 +0000 Subject: [PATCH 223/397] Object tagging (#4754) (#4999) * Object tagging (#4754) * Allow disabling tagging * Rename to disable_tagging --- src/aws/builder.rs | 22 ++++++++++++ src/aws/client.rs | 23 ++++++++++++ src/aws/mod.rs | 16 +++++++-- src/azure/builder.rs | 22 ++++++++++++ src/azure/client.rs | 27 +++++++++++++- src/azure/mod.rs | 7 ++++ src/lib.rs | 85 +++++++++++++++++++++++++++++++++++++++++++- src/tags.rs | 60 +++++++++++++++++++++++++++++++ 8 files changed, 258 insertions(+), 4 deletions(-) create mode 100644 src/tags.rs diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 79ea75b..cf9490d 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -155,6 +155,8 @@ pub struct AmazonS3Builder { copy_if_not_exists: Option>, /// Put precondition conditional_put: Option>, + /// Ignore tags + disable_tagging: ConfigValue, } /// Configuration keys for [`AmazonS3Builder`] @@ -299,6 +301,15 @@ pub enum AmazonS3ConfigKey { /// Skip signing request SkipSignature, + /// Disable tagging objects + /// + /// This can be desirable if not supported by the backing store + /// + /// Supported keys: + /// - `aws_disable_tagging` + /// - `disable_tagging` + DisableTagging, + /// Client options Client(ClientConfigKey), } @@ -322,6 +333,7 @@ impl AsRef for AmazonS3ConfigKey { Self::SkipSignature => "aws_skip_signature", Self::CopyIfNotExists => "aws_copy_if_not_exists", Self::ConditionalPut => "aws_conditional_put", + Self::DisableTagging => "aws_disable_tagging", Self::Client(opt) => opt.as_ref(), } } @@ -350,6 +362,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_skip_signature" | 
"skip_signature" => Ok(Self::SkipSignature), "aws_copy_if_not_exists" | "copy_if_not_exists" => Ok(Self::CopyIfNotExists), "aws_conditional_put" | "conditional_put" => Ok(Self::ConditionalPut), + "aws_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -453,6 +466,7 @@ impl AmazonS3Builder { self.client_options = self.client_options.with_config(key, value) } AmazonS3ConfigKey::SkipSignature => self.skip_signature.parse(value), + AmazonS3ConfigKey::DisableTagging => self.disable_tagging.parse(value), AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) } @@ -525,6 +539,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ConditionalPut => { self.conditional_put.as_ref().map(ToString::to_string) } + AmazonS3ConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), } } @@ -735,6 +750,12 @@ impl AmazonS3Builder { self } + /// If set to `true` will ignore any tags provided to put_opts + pub fn with_disable_tagging(mut self, ignore: bool) -> Self { + self.disable_tagging = ignore.into(); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(mut self) -> Result { @@ -851,6 +872,7 @@ impl AmazonS3Builder { client_options: self.client_options, sign_payload: !self.unsigned_payload.get()?, skip_signature: self.skip_signature.get()?, + disable_tagging: self.disable_tagging.get()?, checksum, copy_if_not_exists, conditional_put: put_precondition, diff --git a/src/aws/client.rs b/src/aws/client.rs index 20c2a96..3e47abd 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -204,6 +204,7 @@ pub struct S3Config { pub client_options: ClientOptions, pub sign_payload: bool, pub skip_signature: bool, + pub disable_tagging: bool, pub checksum: Option, pub copy_if_not_exists: Option, pub conditional_put: Option, @@ -588,6 +589,28 @@ impl S3Client { version, }) } + + #[cfg(test)] + pub async fn get_object_tagging(&self, path: &Path) -> Result { + let credential = self.config.get_credential().await?; + let url = format!("{}?tagging", self.config.path_url(path)); + let response = self + .client + .request(Method::GET, url) + .with_aws_sigv4( + credential.as_deref(), + &self.config.region, + "s3", + self.config.sign_payload, + None, + ) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + Ok(response) + } } #[async_trait] diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 99e6376..cbb3cff 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -35,7 +35,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; -use reqwest::header::{IF_MATCH, IF_NONE_MATCH}; +use reqwest::header::{HeaderName, IF_MATCH, IF_NONE_MATCH}; use reqwest::Method; use std::{sync::Arc, time::Duration}; use tokio::io::AsyncWrite; @@ -52,6 +52,8 @@ use crate::{ PutOptions, PutResult, Result, }; +static TAGS_HEADER: HeaderName = HeaderName::from_static("x-amz-tagging"); + mod builder; mod checksum; mod client; @@ -160,7 +162,12 @@ impl Signer for AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { - let request = self.client.put_request(location, bytes); + let mut request = self.client.put_request(location, bytes); + let tags = opts.tags.encoded(); + if !tags.is_empty() && 
!self.client.config().disable_tagging { + request = request.header(&TAGS_HEADER, tags); + } + match (opts.mode, &self.client.config().conditional_put) { (PutMode::Overwrite, _) => request.send().await, (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), @@ -342,6 +349,11 @@ mod tests { stream_get(&integration).await; multipart(&integration, &integration).await; + tagging(&integration, !config.disable_tagging, |p| { + let client = Arc::clone(&integration.client); + async move { client.get_object_tagging(&p).await } + }) + .await; if test_not_exists { copy_if_not_exists(&integration).await; } diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 02e0762..6bd2b26 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -173,6 +173,8 @@ pub struct MicrosoftAzureBuilder { /// /// i.e. https://{account_name}.dfs.fabric.microsoft.com use_fabric_endpoint: ConfigValue, + /// When set to true, skips tagging objects + disable_tagging: ConfigValue, } /// Configuration keys for [`MicrosoftAzureBuilder`] @@ -321,6 +323,15 @@ pub enum AzureConfigKey { /// - `container_name` ContainerName, + /// Disables tagging objects + /// + /// This can be desirable if not supported by the backing store + /// + /// Supported keys: + /// - `azure_disable_tagging` + /// - `disable_tagging` + DisableTagging, + /// Client options Client(ClientConfigKey), } @@ -344,6 +355,7 @@ impl AsRef for AzureConfigKey { Self::FederatedTokenFile => "azure_federated_token_file", Self::UseAzureCli => "azure_use_azure_cli", Self::ContainerName => "azure_container_name", + Self::DisableTagging => "azure_disable_tagging", Self::Client(key) => key.as_ref(), } } @@ -387,6 +399,7 @@ impl FromStr for AzureConfigKey { "azure_use_fabric_endpoint" | "use_fabric_endpoint" => Ok(Self::UseFabricEndpoint), "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), "azure_container_name" | "container_name" => Ok(Self::ContainerName), + "azure_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), // Backwards compatibility "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -503,6 +516,7 @@ impl MicrosoftAzureBuilder { self.client_options = self.client_options.with_config(key, value) } AzureConfigKey::ContainerName => self.container_name = Some(value.into()), + AzureConfigKey::DisableTagging => self.disable_tagging.parse(value), }; self } @@ -556,6 +570,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), AzureConfigKey::Client(key) => self.client_options.get_config_value(key), AzureConfigKey::ContainerName => self.container_name.clone(), + AzureConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), } } @@ -781,6 +796,12 @@ impl MicrosoftAzureBuilder { self } + /// If set to `true` will ignore any tags provided to put_opts + pub fn with_disable_tagging(mut self, ignore: bool) -> Self { + self.disable_tagging = ignore.into(); + self + } + /// Configure a connection to container with given name on Microsoft Azure Blob store. 
pub fn build(mut self) -> Result { if let Some(url) = self.url.take() { @@ -885,6 +906,7 @@ impl MicrosoftAzureBuilder { account, is_emulator, container, + disable_tagging: self.disable_tagging.get()?, retry_config: self.retry_config, client_options: self.client_options, service: storage_url, diff --git a/src/azure/client.rs b/src/azure/client.rs index c7bd791..3c71e69 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -50,6 +50,8 @@ use url::Url; const VERSION_HEADER: &str = "x-ms-version-id"; +static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -124,11 +126,12 @@ pub(crate) struct AzureConfig { pub retry_config: RetryConfig, pub service: Url, pub is_emulator: bool, + pub disable_tagging: bool, pub client_options: ClientOptions, } impl AzureConfig { - fn path_url(&self, path: &Path) -> Url { + pub(crate) fn path_url(&self, path: &Path) -> Url { let mut url = self.service.clone(); { let mut path_mut = url.path_segments_mut().unwrap(); @@ -229,6 +232,11 @@ impl AzureClient { } }; + let builder = match (opts.tags.encoded(), self.config.disable_tagging) { + ("", _) | (_, true) => builder, + (tags, false) => builder.header(&TAGS_HEADER, tags), + }; + let response = builder.header(&BLOB_TYPE, "BlockBlob").send().await?; Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) } @@ -315,6 +323,23 @@ impl AzureClient { Ok(()) } + + #[cfg(test)] + pub async fn get_blob_tagging(&self, path: &Path) -> Result { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + let response = self + .client + .request(Method::GET, url) + .query(&[("comp", "tags")]) + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + Ok(response) + } } #[async_trait] diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 762a51d..1d51cbd 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -202,6 +202,13 @@ mod tests { stream_get(&integration).await; put_opts(&integration, true).await; multipart(&integration, &integration).await; + + let validate = !integration.client.config().disable_tagging; + tagging(&integration, validate, |p| { + let client = Arc::clone(&integration.client); + async move { client.get_blob_tagging(&p).await } + }) + .await } #[test] diff --git a/src/lib.rs b/src/lib.rs index 6696430..51203ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -263,6 +263,10 @@ pub use client::{ #[cfg(feature = "cloud")] mod config; +mod tags; + +pub use tags::TagSet; + pub mod multipart; mod parse; mod util; @@ -893,11 +897,27 @@ impl From for UpdateVersion { pub struct PutOptions { /// Configure the [`PutMode`] for this operation pub mode: PutMode, + /// Provide a [`TagSet`] for this object + /// + /// Implementations that don't support object tagging should ignore this + pub tags: TagSet, } impl From for PutOptions { fn from(mode: PutMode) -> Self { - Self { mode } + Self { + mode, + ..Default::default() + } + } +} + +impl From for PutOptions { + fn from(tags: TagSet) -> Self { + Self { + tags, + ..Default::default() + } } } @@ -1015,6 +1035,7 @@ mod tests { use chrono::TimeZone; use futures::stream::FuturesUnordered; use rand::{thread_rng, Rng}; + use std::future::Future; use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { @@ -1882,6 +1903,68 @@ mod tests { 
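With the `From<TagSet>` conversion above, tags can be attached to a write without spelling out the full `PutOptions`. A small sketch of the intended call pattern (the tag keys and values are illustrative; stores without tagging support, or with `disable_tagging` set, simply ignore the `TagSet`):

```rust
use object_store::{memory::InMemory, path::Path, ObjectStore, TagSet};

#[tokio::main]
async fn main() {
    // Any ObjectStore works here; InMemory ignores the tags entirely
    let store = InMemory::new();

    let mut tags = TagSet::default();
    tags.push("owner", "analytics");
    tags.push("retention", "30d");

    store
        .put_opts(&Path::from("data/file1"), "hello".into(), tags.into())
        .await
        .unwrap();
}
```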
assert_eq!(meta.size, chunk_size * 2); } + #[cfg(any(feature = "aws", feature = "azure"))] + pub(crate) async fn tagging(storage: &dyn ObjectStore, validate: bool, get_tags: F) + where + F: Fn(Path) -> Fut + Send + Sync, + Fut: Future> + Send, + { + use bytes::Buf; + use serde::Deserialize; + + #[derive(Deserialize)] + struct Tagging { + #[serde(rename = "TagSet")] + list: TagList, + } + + #[derive(Debug, Deserialize)] + struct TagList { + #[serde(rename = "Tag")] + tags: Vec, + } + + #[derive(Debug, Deserialize, Eq, PartialEq)] + #[serde(rename_all = "PascalCase")] + struct Tag { + key: String, + value: String, + } + + let tags = vec![ + Tag { + key: "foo.com=bar/s".to_string(), + value: "bananas/foo.com-_".to_string(), + }, + Tag { + key: "namespace/key.foo".to_string(), + value: "value with a space".to_string(), + }, + ]; + let mut tag_set = TagSet::default(); + for t in &tags { + tag_set.push(&t.key, &t.value) + } + + let path = Path::from("tag_test"); + storage + .put_opts(&path, "test".into(), tag_set.into()) + .await + .unwrap(); + + // Write should always succeed, but certain configurations may simply ignore tags + if !validate { + return; + } + + let resp = get_tags(path.clone()).await.unwrap(); + let body = resp.bytes().await.unwrap(); + + let mut resp: Tagging = quick_xml::de::from_reader(body.reader()).unwrap(); + resp.list.tags.sort_by(|a, b| a.key.cmp(&b.key)); + assert_eq!(resp.list.tags, tags); + } + async fn delete_fixtures(storage: &DynObjectStore) { let paths = storage.list(None).map_ok(|meta| meta.location).boxed(); storage diff --git a/src/tags.rs b/src/tags.rs new file mode 100644 index 0000000..fa6e591 --- /dev/null +++ b/src/tags.rs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use url::form_urlencoded::Serializer; + +/// A collection of key value pairs used to annotate objects +/// +/// +/// +#[derive(Debug, Clone, Default, Eq, PartialEq)] +pub struct TagSet(String); + +impl TagSet { + /// Append a key value pair to this [`TagSet`] + /// + /// Stores have different restrictions on what characters are permitted, + /// for portability it is recommended applications use no more than 10 tags, + /// and stick to alphanumeric characters, and `+ - = . 
_ : /` + /// + /// + /// + pub fn push(&mut self, key: &str, value: &str) { + Serializer::new(&mut self.0).append_pair(key, value); + } + + /// Return this [`TagSet`] as a URL-encoded string + pub fn encoded(&self) -> &str { + &self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tag_set() { + let mut set = TagSet::default(); + set.push("test/foo", "value sdlks"); + set.push("foo", " sdf _ /+./sd"); + assert_eq!( + set.encoded(), + "test%2Ffoo=value+sdlks&foo=+sdf+_+%2F%2B.%2Fsd" + ); + } +} From 07d1d40274e5bd6f8b0b329995fe37f118c09dd0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 30 Oct 2023 12:02:17 +0000 Subject: [PATCH 224/397] Improve object_store docs (#4978) * Improve object_store docs * Document configuration system * Review feedback --- src/lib.rs | 285 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 250 insertions(+), 35 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 51203ca..69db9d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,13 +38,18 @@ //! //! # Highlights //! -//! 1. A focused, easy to use, idiomatic, well documented, high -//! performance, `async` API. +//! 1. A high-performance async API focused on providing a consistent interface +//! mirroring that of object stores such as [S3] //! //! 2. Production quality, leading this crate to be used in large -//! scale production systems, such as [crates.io] and [InfluxDB IOx]. +//! scale production systems, such as [crates.io] and [InfluxDB IOx] //! -//! 3. Stable and predictable governance via the [Apache Arrow] project. +//! 3. Support for advanced functionality, including atomic, conditional reads +//! and writes, vectored IO, bulk deletion, and more... +//! +//! 4. Stable and predictable governance via the [Apache Arrow] project +//! +//! 5. Small dependency footprint, depending on only a small number of common crates //! //! Originally developed for [InfluxDB IOx] and subsequently donated //! to [Apache Arrow]. @@ -52,6 +57,8 @@ //! [Apache Arrow]: https://arrow.apache.org/ //! [InfluxDB IOx]: https://github.com/influxdata/influxdb_iox/ //! [crates.io]: https://github.com/rust-lang/crates.io +//! [ACID]: https://en.wikipedia.org/wiki/ACID +//! [S3]: https://aws.amazon.com/s3/ //! //! # Available [`ObjectStore`] Implementations //! @@ -79,6 +86,23 @@ doc = "* [`http`]: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] //! +//! # Why not a Filesystem Interface? +//! +//! Whilst this crate does provide a [`BufReader`], the [`ObjectStore`] interface mirrors the APIs +//! of object stores and not filesystems, opting to provide stateless APIs instead of the cursor +//! based interfaces such as [`Read`] or [`Seek`] favoured by filesystems. +//! +//! This provides some compelling advantages: +//! +//! * Except where explicitly stated otherwise, operations are atomic, and readers +//! cannot observe partial and/or failed writes +//! * Methods map directly to object store APIs, providing both efficiency and predictability +//! * Abstracts away filesystem and operating system specific quirks, ensuring portability +//! * Allows for functionality not native to filesystems, such as operation preconditions +//! and atomic multipart uploads +//! +//! [`BufReader`]: buffered::BufReader +//! //! # Adapters //! //! [`ObjectStore`] instances can be composed with various adapters @@ -87,8 +111,43 @@ //! 
* Rate Throttling: [`ThrottleConfig`](throttle::ThrottleConfig) //! * Concurrent Request Limit: [`LimitStore`](limit::LimitStore) //! +//! # Configuration System +//! +//! This crate provides a configuration system inspired by the APIs exposed by [fsspec], +//! [PyArrow FileSystem], and [Hadoop FileSystem], allowing creating a [`DynObjectStore`] +//! from a URL and an optional list of key value pairs. This provides a flexible interface +//! to support a wide variety of user-defined store configurations, with minimal additional +//! application complexity. +//! +//! ```no_run +//! # use url::Url; +//! # use object_store::{parse_url, parse_url_opts}; +//! # use object_store::aws::{AmazonS3, AmazonS3Builder}; +//! # +//! # +//! // Can manually create a specific store variant using the appropriate builder +//! let store: AmazonS3 = AmazonS3Builder::from_env() +//! .with_bucket_name("my-bucket").build().unwrap(); //! -//! # List objects: +//! // Alternatively can create an ObjectStore from an S3 URL +//! let url = Url::parse("s3://bucket/path").unwrap(); +//! let (store, path) = parse_url(&url).unwrap(); +//! assert_eq!(path.as_ref(), "path"); +//! +//! // Potentially with additional options +//! let (store, path) = parse_url_opts(&url, vec![("aws_access_key_id", "...")]).unwrap(); +//! +//! // Or with URLs that encode the bucket name in the URL path +//! let url = Url::parse("https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket/path").unwrap(); +//! let (store, path) = parse_url(&url).unwrap(); +//! assert_eq!(path.as_ref(), "path"); +//! ``` +//! +//! [PyArrow FileSystem]: https://arrow.apache.org/docs/python/generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem.from_uri +//! [fsspec]: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem +//! [Hadoop FileSystem]: https://hadoop.apache.org/docs/r3.0.0/api/org/apache/hadoop/fs/FileSystem.html#get-java.net.URI-org.apache.hadoop.conf.Configuration- +//! +//! # List objects //! //! Use the [`ObjectStore::list`] method to iterate over objects in //! remote storage or files in the local filesystem: @@ -111,7 +170,7 @@ //! // Recursively list all files below the 'data' path. //! // 1. On AWS S3 this would be the 'data/' prefix //! // 2. On a local filesystem, this would be the 'data' directory -//! let prefix: Path = "data".try_into().unwrap(); +//! let prefix = Path::from("data"); //! //! // Get an `async` stream of Metadata objects: //! let mut list_stream = object_store.list(Some(&prefix)); @@ -141,25 +200,34 @@ //! # use futures::TryStreamExt; //! # use object_store::local::LocalFileSystem; //! # use std::sync::Arc; -//! # use object_store::{path::Path, ObjectStore}; +//! # use bytes::Bytes; +//! # use object_store::{path::Path, ObjectStore, GetResult}; //! # fn get_object_store() -> Arc { //! # Arc::new(LocalFileSystem::new()) //! # } //! # //! # async fn example() { //! # -//! // create an ObjectStore +//! // Create an ObjectStore //! let object_store: Arc = get_object_store(); //! //! // Retrieve a specific file -//! let path: Path = "data/file01.parquet".try_into().unwrap(); +//! let path = Path::from("data/file01.parquet"); +//! +//! // Fetch just the file metadata +//! let meta = object_store.head(&path).await.unwrap(); +//! println!("{meta:?}"); +//! +//! // Fetch the object including metadata +//! let result: GetResult = object_store.get(&path).await.unwrap(); +//! assert_eq!(result.meta, meta); +//! +//! // Buffer the entire object in memory +//! let object: Bytes = result.bytes().await.unwrap(); +//! 
assert_eq!(object.len(), meta.size); //! -//! // fetch the bytes from object store -//! let stream = object_store -//! .get(&path) -//! .await -//! .unwrap() -//! .into_stream(); +//! // Alternatively stream the bytes from object storage +//! let stream = object_store.get(&path).await.unwrap().into_stream(); //! //! // Count the '0's using `try_fold` from `TryStreamExt` trait //! let num_zeros = stream @@ -171,13 +239,9 @@ //! # } //! ``` //! -//! Which will print out something like the following: +//! # Put Object //! -//! ```text -//! Num zeros in data/file01.parquet is 657 -//! ``` -//! # Put object -//! Use the [`ObjectStore::put`] method to save data in remote storage or local filesystem. +//! Use the [`ObjectStore::put`] method to atomically write data. //! //! ``` //! # use object_store::local::LocalFileSystem; @@ -190,15 +254,17 @@ //! # } //! # async fn put() { //! # -//! let object_store: Arc = get_object_store(); -//! let path: Path = "data/file1".try_into().unwrap(); -//! let bytes = Bytes::from_static(b"hello"); -//! object_store.put(&path, bytes).await.unwrap(); +//! let object_store: Arc = get_object_store(); +//! let path = Path::from("data/file1"); +//! let bytes = Bytes::from_static(b"hello"); +//! object_store.put(&path, bytes).await.unwrap(); //! # } //! ``` //! -//! # Multipart put object -//! Use the [`ObjectStore::put_multipart`] method to save large amount of data in chunks. +//! # Multipart Upload +//! +//! Use the [`ObjectStore::put_multipart`] method to atomically write a large amount of data, +//! with implementations automatically handling parallel, chunked upload where appropriate. //! //! ``` //! # use object_store::local::LocalFileSystem; @@ -212,16 +278,165 @@ //! # } //! # async fn multi_upload() { //! # -//! let object_store: Arc = get_object_store(); -//! let path: Path = "data/large_file".try_into().unwrap(); -//! let (_id, mut writer) = object_store.put_multipart(&path).await.unwrap(); -//! -//! let bytes = Bytes::from_static(b"hello"); -//! writer.write_all(&bytes).await.unwrap(); -//! writer.flush().await.unwrap(); -//! writer.shutdown().await.unwrap(); +//! let object_store: Arc = get_object_store(); +//! let path = Path::from("data/large_file"); +//! let (_id, mut writer) = object_store.put_multipart(&path).await.unwrap(); +//! +//! let bytes = Bytes::from_static(b"hello"); +//! writer.write_all(&bytes).await.unwrap(); +//! writer.flush().await.unwrap(); +//! writer.shutdown().await.unwrap(); //! # } //! ``` +//! +//! # Vectored Read +//! +//! A common pattern, especially when reading structured datasets, is to need to fetch +//! multiple, potentially non-contiguous, ranges of a particular object. +//! +//! [`ObjectStore::get_ranges`] provides an efficient way to perform such vectored IO, and will +//! automatically coalesce adjacent ranges into an appropriate number of parallel requests. +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # use object_store::ObjectStore; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) +//! # } +//! # async fn multi_upload() { +//! # +//! let object_store: Arc = get_object_store(); +//! let path = Path::from("data/large_file"); +//! let ranges = object_store.get_ranges(&path, &[90..100, 400..600, 0..10]).await.unwrap(); +//! assert_eq!(ranges.len(), 3); +//! assert_eq!(ranges[0].len(), 10); +//! # } +//! ``` +//! +//! # Conditional Fetch +//! 
+//! More complex object retrieval can be supported by [`ObjectStore::get_opts`]. +//! +//! For example, efficiently refreshing a cache without re-fetching the entire object +//! data if the object hasn't been modified. +//! +//! ``` +//! # use std::collections::btree_map::Entry; +//! # use std::collections::HashMap; +//! # use object_store::{GetOptions, GetResult, ObjectStore, Result, Error}; +//! # use std::sync::Arc; +//! # use std::time::{Duration, Instant}; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::path::Path; +//! struct CacheEntry { +//! /// Data returned by last request +//! data: Bytes, +//! /// ETag identifying the object returned by the server +//! e_tag: String, +//! /// Instant of last refresh +//! refreshed_at: Instant, +//! } +//! +//! /// Example cache that checks entries after 10 seconds for a new version +//! struct Cache { +//! entries: HashMap, +//! store: Arc, +//! } +//! +//! impl Cache { +//! pub async fn get(&mut self, path: &Path) -> Result { +//! Ok(match self.entries.get_mut(path) { +//! Some(e) => match e.refreshed_at.elapsed() < Duration::from_secs(10) { +//! true => e.data.clone(), // Return cached data +//! false => { // Check if remote version has changed +//! let opts = GetOptions { +//! if_none_match: Some(e.e_tag.clone()), +//! ..GetOptions::default() +//! }; +//! match self.store.get_opts(&path, opts).await { +//! Ok(d) => e.data = d.bytes().await?, +//! Err(Error::NotModified { .. }) => {} // Data has not changed +//! Err(e) => return Err(e), +//! }; +//! e.refreshed_at = Instant::now(); +//! e.data.clone() +//! } +//! }, +//! None => { // Not cached, fetch data +//! let get = self.store.get(&path).await?; +//! let e_tag = get.meta.e_tag.clone(); +//! let data = get.bytes().await?; +//! if let Some(e_tag) = e_tag { +//! let entry = CacheEntry { +//! e_tag, +//! data: data.clone(), +//! refreshed_at: Instant::now(), +//! }; +//! self.entries.insert(path.clone(), entry); +//! } +//! data +//! } +//! }) +//! } +//! } +//! ``` +//! +//! # Conditional Put +//! +//! The default behaviour when writing data is to upsert any existing object at the given path, +//! overwriting any previous value. More complex behaviours can be achieved using [`PutMode`], and +//! can be used to build [Optimistic Concurrency Control] based transactions. This facilitates +//! building metadata catalogs, such as [Apache Iceberg] or [Delta Lake], directly on top of object +//! storage, without relying on a separate DBMS. +//! +//! ``` +//! # use object_store::{Error, ObjectStore, PutMode, UpdateVersion}; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::memory::InMemory; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(InMemory::new()) +//! # } +//! # fn do_update(b: Bytes) -> Bytes {b} +//! # async fn conditional_put() { +//! let store = get_object_store(); +//! let path = Path::from("test"); +//! +//! // Perform a conditional update on path +//! loop { +//! // Perform get request +//! let r = store.get(&path).await.unwrap(); +//! +//! // Save version information fetched +//! let version = UpdateVersion { +//! e_tag: r.meta.e_tag.clone(), +//! version: r.meta.version.clone(), +//! }; +//! +//! // Compute new version of object contents +//! let new = do_update(r.bytes().await.unwrap()); +//! +//! // Attempt to commit transaction +//! match store.put_opts(&path, new, PutMode::Update(version).into()).await { +//! 
Ok(_) => break, // Successfully committed +//! Err(Error::Precondition { .. }) => continue, // Object has changed, try again +//! Err(e) => panic!("{e}") +//! } +//! } +//! # } +//! ``` +//! +//! [Optimistic Concurrency Control]: https://en.wikipedia.org/wiki/Optimistic_concurrency_control +//! [Apache Iceberg]: https://iceberg.apache.org/ +//! [Delta Lake]: https://delta.io/ +//! #[cfg(all( target_arch = "wasm32", From baebb6698bcfa83661f3f936e9d51b53caa61bba Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 1 Nov 2023 14:33:37 +0000 Subject: [PATCH 225/397] Remove ObjectStore::append (#5016) --- Cargo.toml | 5 -- src/lib.rs | 31 +----------- src/limit.rs | 7 --- src/local.rs | 126 ------------------------------------------------ src/memory.rs | 99 ------------------------------------- src/prefix.rs | 6 --- src/throttle.rs | 4 -- 7 files changed, 1 insertion(+), 277 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cb820b5..c8cf4e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,11 +53,6 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } - -[target.'cfg(not(target_arch = "wasm32"))'.dependencies] -tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util", "fs"] } - -[target.'cfg(target_arch = "wasm32")'.dependencies] tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } [target.'cfg(target_family="unix")'.dev-dependencies] diff --git a/src/lib.rs b/src/lib.rs index 69db9d9..1b94f81 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -94,8 +94,7 @@ //! //! This provides some compelling advantages: //! -//! * Except where explicitly stated otherwise, operations are atomic, and readers -//! cannot observe partial and/or failed writes +//! * All operations are atomic, and readers cannot observe partial and/or failed writes //! * Methods map directly to object store APIs, providing both efficiency and predictability //! * Abstracts away filesystem and operating system specific quirks, ensuring portability //! * Allows for functionality not native to filesystems, such as operation preconditions @@ -559,30 +558,6 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// vary by object store. async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()>; - /// Returns an [`AsyncWrite`] that can be used to append to the object at `location` - /// - /// A new object will be created if it doesn't already exist, otherwise it will be - /// opened, with subsequent writes appended to the end. 
- /// - /// This operation cannot be supported by all stores, most use-cases should prefer - /// [`ObjectStore::put`] and [`ObjectStore::put_multipart`] for better portability - /// and stronger guarantees - /// - /// This API is not guaranteed to be atomic, in particular - /// - /// * On error, `location` may contain partial data - /// * Concurrent calls to [`ObjectStore::list`] may return partially written objects - /// * Concurrent calls to [`ObjectStore::get`] may return partially written data - /// * Concurrent calls to [`ObjectStore::put`] may result in data loss / corruption - /// * Concurrent calls to [`ObjectStore::append`] may result in data loss / corruption - /// - /// Additionally some stores, such as Azure, may only support appending to objects created - /// with [`ObjectStore::append`], and not with [`ObjectStore::put`], [`ObjectStore::copy`], or - /// [`ObjectStore::put_multipart`] - async fn append(&self, _location: &Path) -> Result> { - Err(Error::NotImplemented) - } - /// Return the bytes that are stored at the specified location. async fn get(&self, location: &Path) -> Result { self.get_opts(location, GetOptions::default()).await @@ -779,10 +754,6 @@ macro_rules! as_ref_impl { self.as_ref().abort_multipart(location, multipart_id).await } - async fn append(&self, location: &Path) -> Result> { - self.as_ref().append(location).await - } - async fn get(&self, location: &Path) -> Result { self.as_ref().get(location).await } diff --git a/src/limit.rs b/src/limit.rs index 39cc605..d1363d9 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -94,13 +94,6 @@ impl ObjectStore for LimitStore { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.abort_multipart(location, multipart_id).await } - - async fn append(&self, location: &Path) -> Result> { - let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - let write = self.inner.append(location).await?; - Ok(Box::new(PermitWrapper::new(write, permit))) - } - async fn get(&self, location: &Path) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); let r = self.inner.get(location).await?; diff --git a/src/local.rs b/src/local.rs index 919baf7..1a87dc3 100644 --- a/src/local.rs +++ b/src/local.rs @@ -350,45 +350,6 @@ impl ObjectStore for LocalFileSystem { .await } - async fn append(&self, location: &Path) -> Result> { - // Get the path to the file from the configuration. - let path = self.config.path_to_filesystem(location)?; - loop { - // Create new `OpenOptions`. - let mut options = tokio::fs::OpenOptions::new(); - - // Attempt to open the file with the given options. - match options - .truncate(false) - .append(true) - .create(true) - .open(&path) - .await - { - // If the file was successfully opened, return it wrapped in a boxed `AsyncWrite` trait object. - Ok(file) => return Ok(Box::new(file)), - // If the error is that the file was not found, attempt to create the file and any necessary parent directories. - Err(source) if source.kind() == ErrorKind::NotFound => { - // Get the path to the parent directory of the file. - let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { - path: path.to_path_buf(), - source, - })?; - - // Create the parent directory and any necessary ancestors. - tokio::fs::create_dir_all(parent) - .await - // If creating the directory fails, return a `UnableToCreateDirSnafu` error. - .context(UnableToCreateDirSnafu { path: parent })?; - // Try again to open the file. 
- continue; - } - // If any other error occurs, return a `UnableToOpenFile` error. - Err(source) => return Err(Error::UnableToOpenFile { source, path }.into()), - } - } - } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let location = location.clone(); let path = self.config.path_to_filesystem(&location)?; @@ -1449,97 +1410,10 @@ mod tests { mod not_wasm_tests { use crate::local::LocalFileSystem; use crate::{ObjectStore, Path}; - use bytes::Bytes; use std::time::Duration; use tempfile::TempDir; use tokio::io::AsyncWriteExt; - #[tokio::test] - async fn creates_dir_if_not_present_append() { - let root = TempDir::new().unwrap(); - let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - - let location = Path::from("nested/file/test_file"); - - let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - - let mut writer = integration.append(&location).await.unwrap(); - - writer.write_all(data.as_ref()).await.unwrap(); - - writer.flush().await.unwrap(); - - let read_data = integration - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(&*read_data, expected_data); - } - - #[tokio::test] - async fn unknown_length_append() { - let root = TempDir::new().unwrap(); - let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - - let location = Path::from("some_file"); - - let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - let mut writer = integration.append(&location).await.unwrap(); - - writer.write_all(data.as_ref()).await.unwrap(); - writer.flush().await.unwrap(); - - let read_data = integration - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(&*read_data, expected_data); - } - - #[tokio::test] - async fn multiple_append() { - let root = TempDir::new().unwrap(); - let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - - let location = Path::from("some_file"); - - let data = vec![ - Bytes::from("arbitrary"), - Bytes::from("data"), - Bytes::from("gnz"), - ]; - - let mut writer = integration.append(&location).await.unwrap(); - for d in &data { - writer.write_all(d).await.unwrap(); - } - writer.flush().await.unwrap(); - - let mut writer = integration.append(&location).await.unwrap(); - for d in &data { - writer.write_all(d).await.unwrap(); - } - writer.flush().await.unwrap(); - - let read_data = integration - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - let expected_data = Bytes::from("arbitrarydatagnzarbitrarydatagnz"); - assert_eq!(&*read_data, expected_data); - } - #[tokio::test] async fn test_cleanup_intermediate_files() { let root = TempDir::new().unwrap(); diff --git a/src/memory.rs b/src/memory.rs index 9d79a79..3823001 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -205,14 +205,6 @@ impl ObjectStore for InMemory { Ok(()) } - async fn append(&self, location: &Path) -> Result> { - Ok(Box::new(InMemoryAppend { - location: location.clone(), - data: Vec::::new(), - storage: SharedStorage::clone(&self.storage), - })) - } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let entry = self.entry(location).await?; let e_tag = entry.e_tag.to_string(); @@ -443,53 +435,8 @@ impl AsyncWrite for InMemoryUpload { } } -struct InMemoryAppend { - location: Path, - data: Vec, - storage: Arc>, -} - -impl AsyncWrite for InMemoryAppend { - fn poll_write( - mut self: Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - buf: &[u8], - ) -> Poll> { - 
self.data.extend_from_slice(buf); - Poll::Ready(Ok(buf.len())) - } - - fn poll_flush( - mut self: Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - ) -> Poll> { - let storage = Arc::clone(&self.storage); - - let mut writer = storage.write(); - - if let Some(entry) = writer.map.remove(&self.location) { - let buf = std::mem::take(&mut self.data); - let concat = Bytes::from_iter(entry.data.into_iter().chain(buf)); - writer.insert(&self.location, concat); - } else { - let data = Bytes::from(std::mem::take(&mut self.data)); - writer.insert(&self.location, data); - }; - Poll::Ready(Ok(())) - } - - fn poll_shutdown( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - self.poll_flush(cx) - } -} - #[cfg(test)] mod tests { - use tokio::io::AsyncWriteExt; - use super::*; use crate::tests::*; @@ -577,50 +524,4 @@ mod tests { panic!("unexpected error type: {err:?}"); } } - - #[tokio::test] - async fn test_append_new() { - let in_memory = InMemory::new(); - let location = Path::from("some_file"); - let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - - let mut writer = in_memory.append(&location).await.unwrap(); - writer.write_all(&data).await.unwrap(); - writer.flush().await.unwrap(); - - let read_data = in_memory - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(&*read_data, expected_data); - } - - #[tokio::test] - async fn test_append_existing() { - let in_memory = InMemory::new(); - let location = Path::from("some_file"); - let data = Bytes::from("arbitrary"); - let data_appended = Bytes::from(" data"); - let expected_data = Bytes::from("arbitrary data"); - - let mut writer = in_memory.append(&location).await.unwrap(); - writer.write_all(&data).await.unwrap(); - writer.flush().await.unwrap(); - - writer.write_all(&data_appended).await.unwrap(); - writer.flush().await.unwrap(); - - let read_data = in_memory - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(&*read_data, expected_data); - } } diff --git a/src/prefix.rs b/src/prefix.rs index 6810130..38f9b07 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -103,12 +103,6 @@ impl ObjectStore for PrefixStore { let full_path = self.full_path(location); self.inner.abort_multipart(&full_path, multipart_id).await } - - async fn append(&self, location: &Path) -> Result> { - let full_path = self.full_path(location); - self.inner.append(&full_path).await - } - async fn get(&self, location: &Path) -> Result { let full_path = self.full_path(location); self.inner.get(&full_path).await diff --git a/src/throttle.rs b/src/throttle.rs index dcd2c04..252256a 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -169,10 +169,6 @@ impl ObjectStore for ThrottledStore { Err(super::Error::NotImplemented) } - async fn append(&self, _location: &Path) -> Result> { - Err(super::Error::NotImplemented) - } - async fn get(&self, location: &Path) -> Result { sleep(self.config().wait_get_per_call).await; From 9beaee7496e91f919393e630849f42d3802fc420 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 1 Nov 2023 14:33:47 +0000 Subject: [PATCH 226/397] Decode URL paths (#5017) (#5018) --- src/parse.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index 51993e2..0fbc33c 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -98,8 +98,7 @@ impl ObjectStoreScheme { _ => return Err(Error::Unrecognised { url: url.clone() }), }; - let path = 
Path::parse(path)?; - Ok((scheme, path)) + Ok((scheme, Path::from_url_path(path)?)) } } @@ -240,6 +239,18 @@ mod tests { ), ("http://mydomain/path", (ObjectStoreScheme::Http, "path")), ("https://mydomain/path", (ObjectStoreScheme::Http, "path")), + ( + "s3://bucket/foo%20bar", + (ObjectStoreScheme::AmazonS3, "foo bar"), + ), + ( + "https://foo/bar%20baz", + (ObjectStoreScheme::Http, "bar baz"), + ), + ( + "file:///bar%252Efoo", + (ObjectStoreScheme::Local, "bar%2Efoo"), + ), ]; for (s, (expected_scheme, expected_path)) in cases { @@ -260,4 +271,12 @@ mod tests { assert!(ObjectStoreScheme::parse(&url).is_err()); } } + + #[test] + fn test_url_spaces() { + let url = Url::parse("file:///my file with spaces").unwrap(); + assert_eq!(url.path(), "/my%20file%20with%20spaces"); + let (_, path) = parse_url(&url).unwrap(); + assert_eq!(path.as_ref(), "my file with spaces"); + } } From 488159ed9c53b9afda7da1fa90a50114b0fbf520 Mon Sep 17 00:00:00 2001 From: kamille <34352236+Rachelint@users.noreply.github.com> Date: Thu, 2 Nov 2023 00:01:39 +0800 Subject: [PATCH 227/397] ObjectStore: make error msg thrown from retry more detailed (#5012) * optimize error msg for better debugging. * fix unit test. * fix fmt. --- src/client/retry.rs | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/client/retry.rs b/src/client/retry.rs index 789103c..08b9a74 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -39,9 +39,12 @@ pub enum Error { body: Option, }, - #[snafu(display("Error after {retries} retries: {source}"))] + #[snafu(display("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}"))] Reqwest { retries: usize, + max_retries: usize, + elapsed: Duration, + retry_timeout: Duration, source: reqwest::Error, }, } @@ -198,7 +201,6 @@ impl RetryExt for reqwest::RequestBuilder { } Err(e) => { let status = r.status(); - if retries == max_retries || now.elapsed() > retry_timeout || !status.is_server_error() { @@ -214,12 +216,18 @@ impl RetryExt for reqwest::RequestBuilder { Err(e) => { Error::Reqwest { retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, source: e, } } } false => Error::Reqwest { retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, source: e, } }); @@ -248,6 +256,9 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error::Reqwest { retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, source: e, }) } @@ -408,9 +419,8 @@ mod tests { let e = do_request().await.unwrap_err().to_string(); assert!( - e.starts_with( - "Error after 2 retries: HTTP status server error (502 Bad Gateway) for url" - ), + e.contains("Error after 2 retries in") && + e.contains("max_retries:2, retry_timeout:1000s, source:HTTP status server error (502 Bad Gateway) for url"), "{e}" ); @@ -425,7 +435,10 @@ mod tests { } let e = do_request().await.unwrap_err().to_string(); assert!( - e.starts_with("Error after 2 retries: error sending request for url"), + e.contains("Error after 2 retries in") + && e.contains( + "max_retries:2, retry_timeout:1000s, source:error sending request for url" + ), "{e}" ); From b8bc96982b2a1fc297a883187eb39e1019bf78d0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:27:53 +0000 Subject: [PATCH 228/397] Relax path safety (#5019) (#5020) * Relax path safety (#5019) * Review feedback * WASM --- src/lib.rs | 17 +++++ src/local.rs | 174 
+++++++++++++++++++++++++++++++++++----------- src/path/mod.rs | 59 +++++++--------- src/path/parts.rs | 23 ++---- 4 files changed, 184 insertions(+), 89 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1b94f81..cdd572d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1442,6 +1442,23 @@ mod tests { storage.delete(&path).await.unwrap(); + // Test handling of unicode paths + let path = Path::parse("🇦🇺/$shenanigans@@~.txt").unwrap(); + storage.put(&path, "test".into()).await.unwrap(); + + let r = storage.get(&path).await.unwrap(); + assert_eq!(r.bytes().await.unwrap(), "test"); + + let dir = Path::parse("🇦🇺").unwrap(); + let r = storage.list_with_delimiter(None).await.unwrap(); + assert!(r.common_prefixes.contains(&dir)); + + let r = storage.list_with_delimiter(Some(&dir)).await.unwrap(); + assert_eq!(r.objects.len(), 1); + assert_eq!(r.objects[0].location, path); + + storage.delete(&path).await.unwrap(); + // Can also write non-percent encoded sequences let path = Path::parse("%Q.parquet").unwrap(); storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); diff --git a/src/local.rs b/src/local.rs index 1a87dc3..e5c4e32 100644 --- a/src/local.rs +++ b/src/local.rs @@ -144,6 +144,11 @@ pub(crate) enum Error { path: PathBuf, source: io::Error, }, + + #[snafu(display("Filenames containing trailing '/#\\d+/' are not supported: {}", path))] + InvalidPath { + path: String, + }, } impl From for super::Error { @@ -176,6 +181,30 @@ impl From for super::Error { /// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme /// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986 /// +/// # Path Semantics +/// +/// [`LocalFileSystem`] will expose the path semantics of the underlying filesystem, which may +/// have additional restrictions beyond those enforced by [`Path`]. +/// +/// For example: +/// +/// * Windows forbids certain filenames, e.g. `COM0`, +/// * Windows forbids folders with trailing `.` +/// * Windows forbids certain ASCII characters, e.g. `<` or `|` +/// * OS X forbids filenames containing `:` +/// * Leading `-` are discouraged on Unix systems where they may be interpreted as CLI flags +/// * Filesystems may have restrictions on the maximum path or path segment length +/// * Filesystem support for non-ASCII characters is inconsistent +/// +/// Additionally some filesystems, such as NTFS, are case-insensitive, whilst others like +/// FAT don't preserve case at all. Further some filesystems support non-unicode character +/// sequences, such as unpaired UTF-16 surrogates, and [`LocalFileSystem`] will error on +/// encountering such sequences. +/// +/// Finally, filenames matching the regex `/.*#\d+/`, e.g. `foo.parquet#123`, are not supported +/// by [`LocalFileSystem`] as they are used to provide atomic writes. Such files will be ignored +/// for listing operations, and attempting to address such a file will error. +/// /// # Tokio Compatibility /// /// Tokio discourages performing blocking IO on a tokio worker thread, however, @@ -196,6 +225,11 @@ impl From for super::Error { /// * Mutating a file through one or more symlinks will mutate the underlying file /// * Deleting a path that resolves to a symlink will only delete the symlink /// +/// # Cross-Filesystem Copy +/// +/// [`LocalFileSystem::copy`] is implemented using [`std::fs::hard_link`], and therefore +/// does not support copying across filesystem boundaries. 
+/// #[derive(Debug)] pub struct LocalFileSystem { config: Arc, @@ -246,8 +280,19 @@ impl LocalFileSystem { } impl Config { - /// Return an absolute filesystem path of the given location + /// Return an absolute filesystem path of the given file location fn path_to_filesystem(&self, location: &Path) -> Result { + ensure!( + is_valid_file_path(location), + InvalidPathSnafu { + path: location.as_ref() + } + ); + self.prefix_to_filesystem(location) + } + + /// Return an absolute filesystem path of the given location + fn prefix_to_filesystem(&self, location: &Path) -> Result { let mut url = self.root.clone(); url.path_segments_mut() .expect("url path") @@ -269,6 +314,19 @@ impl Config { } } +fn is_valid_file_path(path: &Path) -> bool { + match path.filename() { + Some(p) => match p.split_once('#') { + Some((_, suffix)) if !suffix.is_empty() => { + // Valid if contains non-digits + !suffix.as_bytes().iter().all(|x| x.is_ascii_digit()) + } + _ => true, + }, + None => false, + } +} + #[async_trait] impl ObjectStore for LocalFileSystem { async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { @@ -406,7 +464,7 @@ impl ObjectStore for LocalFileSystem { let config = Arc::clone(&self.config); let root_path = match prefix { - Some(prefix) => match config.path_to_filesystem(prefix) { + Some(prefix) => match config.prefix_to_filesystem(prefix) { Ok(path) => path, Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(), }, @@ -419,20 +477,21 @@ impl ObjectStore for LocalFileSystem { .follow_links(true); let s = walkdir.into_iter().flat_map(move |result_dir_entry| { - match convert_walkdir_result(result_dir_entry) { + let entry = match convert_walkdir_result(result_dir_entry).transpose()? { + Ok(entry) => entry, + Err(e) => return Some(Err(e)), + }; + + if !entry.path().is_file() { + return None; + } + + match config.filesystem_to_path(entry.path()) { + Ok(path) => match is_valid_file_path(&path) { + true => Some(convert_entry(entry, path)), + false => None, + }, Err(e) => Some(Err(e)), - Ok(None) => None, - Ok(entry @ Some(_)) => entry - .filter(|dir_entry| { - dir_entry.file_type().is_file() - // Ignore file names with # in them, since they might be in-progress uploads. - // They would be rejected anyways by filesystem_to_path below. - && !dir_entry.file_name().to_string_lossy().contains('#') - }) - .map(|entry| { - let location = config.filesystem_to_path(entry.path())?; - convert_entry(entry, location) - }), } }); @@ -473,7 +532,7 @@ impl ObjectStore for LocalFileSystem { let config = Arc::clone(&self.config); let prefix = prefix.cloned().unwrap_or_default(); - let resolved_prefix = config.path_to_filesystem(&prefix)?; + let resolved_prefix = config.prefix_to_filesystem(&prefix)?; maybe_spawn_blocking(move || { let walkdir = WalkDir::new(&resolved_prefix) @@ -486,15 +545,11 @@ impl ObjectStore for LocalFileSystem { for entry_res in walkdir.into_iter().map(convert_walkdir_result) { if let Some(entry) = entry_res? { - if entry.file_type().is_file() - // Ignore file names with # in them, since they might be in-progress uploads. - // They would be rejected anyways by filesystem_to_path below. 
- && entry.file_name().to_string_lossy().contains('#') - { - continue; - } let is_directory = entry.file_type().is_dir(); let entry_location = config.filesystem_to_path(entry.path())?; + if !is_directory && !is_valid_file_path(&entry_location) { + continue; + } let mut parts = match entry_location.prefix_match(&prefix) { Some(parts) => parts, @@ -1325,26 +1380,18 @@ mod tests { assert!(result.common_prefixes.is_empty()); assert_eq!(result.objects[0].location, object); - let illegal = root.join("💀"); - std::fs::write(illegal, "foo").unwrap(); - - // Can list directory that doesn't contain illegal path - flatten_list_stream(&integration, Some(&directory)) - .await - .unwrap(); + let emoji = root.join("💀"); + std::fs::write(emoji, "foo").unwrap(); - // Cannot list illegal file - let err = flatten_list_stream(&integration, None) - .await - .unwrap_err() - .to_string(); + // Can list illegal file + let paths = flatten_list_stream(&integration, None).await.unwrap(); - assert!( - err.contains( - "Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\"" - ), - "{}", - err + assert_eq!( + paths, + vec![ + Path::parse("💀").unwrap(), + Path::parse("directory/child.txt").unwrap() + ] ); } @@ -1403,6 +1450,51 @@ mod tests { let path = Path::from_filesystem_path(".").unwrap(); integration.list_with_delimiter(Some(&path)).await.unwrap(); } + + #[test] + fn test_valid_path() { + let cases = [ + ("foo#123/test.txt", true), + ("foo#123/test#23.txt", true), + ("foo#123/test#34", false), + ("foo😁/test#34", false), + ("foo/test#😁34", true), + ]; + + for (case, expected) in cases { + let path = Path::parse(case).unwrap(); + assert_eq!(is_valid_file_path(&path), expected); + } + } + + #[tokio::test] + async fn test_intermediate_files() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let a = Path::parse("foo#123/test.txt").unwrap(); + integration.put(&a, "test".into()).await.unwrap(); + + let list = flatten_list_stream(&integration, None).await.unwrap(); + assert_eq!(list, vec![a.clone()]); + + std::fs::write(root.path().join("bar#123"), "test").unwrap(); + + // Should ignore file + let list = flatten_list_stream(&integration, None).await.unwrap(); + assert_eq!(list, vec![a.clone()]); + + let b = Path::parse("bar#123").unwrap(); + let err = integration.get(&b).await.unwrap_err().to_string(); + assert_eq!(err, "Generic LocalFileSystem error: Filenames containing trailing '/#\\d+/' are not supported: bar#123"); + + let c = Path::parse("foo#123.txt").unwrap(); + integration.put(&c, "test".into()).await.unwrap(); + + let mut list = flatten_list_stream(&integration, None).await.unwrap(); + list.sort_unstable(); + assert_eq!(list, vec![c, a]); + } } #[cfg(not(target_arch = "wasm32"))] diff --git a/src/path/mod.rs b/src/path/mod.rs index e065c31..f914862 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -65,10 +65,23 @@ pub enum Error { /// A parsed path representation that can be safely written to object storage /// -/// # Path Safety +/// A [`Path`] maintains the following invariants: +/// +/// * Paths are delimited by `/` +/// * Paths do not contain leading or trailing `/` +/// * Paths do not contain relative path segments, i.e. 
`.` or `..` +/// * Paths do not contain empty path segments +/// * Paths do not contain any ASCII control characters +/// +/// There are no enforced restrictions on path length, however, it should be noted that most +/// object stores do not permit paths longer than 1024 bytes, and many filesystems do not +/// support path segments longer than 255 bytes. +/// +/// # Encode /// /// In theory object stores support any UTF-8 character sequence, however, certain character -/// sequences cause compatibility problems with some applications and protocols. As such the +/// sequences cause compatibility problems with some applications and protocols. Additionally +/// some filesystems may impose character restrictions, see [`LocalFileSystem`]. As such the /// naming guidelines for [S3], [GCS] and [Azure Blob Storage] all recommend sticking to a /// limited character subset. /// @@ -76,34 +89,16 @@ pub enum Error { /// [GCS]: https://cloud.google.com/storage/docs/naming-objects /// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names /// -/// This presents libraries with two options for consistent path handling: -/// -/// 1. Allow constructing unsafe paths, allowing for both reading and writing of data to paths -/// that may not be consistently understood or supported -/// 2. Disallow constructing unsafe paths, ensuring data written can be consistently handled by -/// all other systems, but preventing interaction with objects at unsafe paths -/// -/// This library takes the second approach, in particular: -/// -/// * Paths are delimited by `/` -/// * Paths do not start with a `/` -/// * Empty path segments are discarded (e.g. `//` is treated as though it were `/`) -/// * Relative path segments, i.e. `.` and `..` are percent encoded -/// * Unsafe characters are percent encoded, as described by [RFC 1738] -/// * All paths are relative to the root of the object store -/// -/// In order to provide these guarantees there are two ways to safely construct a [`Path`] -/// -/// # Encode -/// -/// A string containing potentially illegal path segments can be encoded to a [`Path`] -/// using [`Path::from`] or [`Path::from_iter`]. +/// A string containing potentially problematic path segments can therefore be encoded to a [`Path`] +/// using [`Path::from`] or [`Path::from_iter`]. This will percent encode any problematic +/// segments according to [RFC 1738]. /// /// ``` /// # use object_store::path::Path; /// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar"); /// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar"); /// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar"); +/// assert_eq!(Path::from("/").as_ref(), ""); /// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar"); /// ``` /// @@ -116,20 +111,20 @@ pub enum Error { /// /// # Parse /// -/// Alternatively a [`Path`] can be created from an existing string, returning an -/// error if it is invalid. Unlike the encoding methods, this will permit -/// valid percent encoded sequences. +/// Alternatively a [`Path`] can be parsed from an existing string, returning an +/// error if it is invalid. Unlike the encoding methods above, this will permit +/// arbitrary unicode, including percent encoded sequences. 
/// /// ``` /// # use object_store::path::Path; -/// /// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar"); -/// Path::parse("..").unwrap_err(); -/// Path::parse("/foo//").unwrap_err(); -/// Path::parse("😀").unwrap_err(); +/// Path::parse("..").unwrap_err(); // Relative path segments are disallowed +/// Path::parse("/foo//").unwrap_err(); // Empty path segments are disallowed +/// Path::parse("\x00").unwrap_err(); // ASCII control characters are disallowed /// ``` /// /// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt +/// [`LocalFileSystem`]: crate::local::LocalFileSystem #[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Ord, PartialOrd)] pub struct Path { /// The raw path with no leading or trailing delimiters @@ -236,7 +231,7 @@ impl Path { pub fn filename(&self) -> Option<&str> { match self.raw.is_empty() { true => None, - false => self.raw.split(DELIMITER).last(), + false => self.raw.rsplit(DELIMITER).next(), } } diff --git a/src/path/parts.rs b/src/path/parts.rs index 9da4815..df7097c 100644 --- a/src/path/parts.rs +++ b/src/path/parts.rs @@ -37,8 +37,10 @@ pub struct InvalidPart { /// The PathPart type exists to validate the directory/file names that form part /// of a path. /// -/// A PathPart instance is guaranteed to to contain no illegal characters (e.g. `/`) -/// as it can only be constructed by going through the `from` impl. +/// A [`PathPart`] is guaranteed to: +/// +/// * Contain no ASCII control characters or `/` +/// * Not be a relative path segment, i.e. `.` or `..` #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] pub struct PathPart<'a> { pub(super) raw: Cow<'a, str>, @@ -54,19 +56,12 @@ impl<'a> PathPart<'a> { }); } - for (idx, b) in segment.as_bytes().iter().cloned().enumerate() { - // A percent character is always valid, even if not - // followed by a valid 2-digit hex code - // https://url.spec.whatwg.org/#percent-encoded-bytes - if b == b'%' { - continue; - } - - if !b.is_ascii() || should_percent_encode(b) { + for c in segment.chars() { + if c.is_ascii_control() || c == '/' { return Err(InvalidPart { segment: segment.to_string(), // This is correct as only single byte characters up to this point - illegal: segment.chars().nth(idx).unwrap().to_string(), + illegal: c.to_string(), }); } } @@ -77,10 +72,6 @@ impl<'a> PathPart<'a> { } } -fn should_percent_encode(c: u8) -> bool { - percent_encode(&[c], INVALID).next().unwrap().len() != 1 -} - /// Characters we want to encode. 
const INVALID: &AsciiSet = &CONTROLS // The delimiter we are reserving for internal hierarchy From 7980d70b471bf5f34dd42e0c2613446e590eabed Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:28:07 +0000 Subject: [PATCH 229/397] Support onelake fabric paths in parse_url (#5000) (#5002) --- src/parse.rs | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/parse.rs b/src/parse.rs index 0fbc33c..ddea034 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -81,7 +81,10 @@ impl ObjectStoreScheme { } ("http", Some(_)) => (Self::Http, url.path()), ("https", Some(host)) => { - if host.ends_with("dfs.core.windows.net") || host.ends_with("blob.core.windows.net") + if host.ends_with("dfs.core.windows.net") + || host.ends_with("blob.core.windows.net") + || host.ends_with("dfs.fabric.microsoft.com") + || host.ends_with("blob.fabric.microsoft.com") { (Self::MicrosoftAzure, url.path()) } else if host.ends_with("amazonaws.com") { @@ -251,6 +254,30 @@ mod tests { "file:///bar%252Efoo", (ObjectStoreScheme::Local, "bar%2Efoo"), ), + ( + "abfss://file_system@account.dfs.fabric.microsoft.com/", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "abfss://file_system@account.dfs.fabric.microsoft.com/", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "https://account.dfs.fabric.microsoft.com/", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "https://account.dfs.fabric.microsoft.com/container", + (ObjectStoreScheme::MicrosoftAzure, "container"), + ), + ( + "https://account.blob.fabric.microsoft.com/", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "https://account.blob.fabric.microsoft.com/container", + (ObjectStoreScheme::MicrosoftAzure, "container"), + ), ]; for (s, (expected_scheme, expected_path)) in cases { From 776b5756f7e743b798932a8b76727d95b23567be Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:30:09 +0000 Subject: [PATCH 230/397] Prepare object_store 0.8.0 (#5010) (#5023) --- CHANGELOG-old.md | 44 +++++++++++++++ CHANGELOG.md | 91 ++++++++++++++++++++++---------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 4 files changed, 109 insertions(+), 32 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index a0ced7c..6780f7d 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,50 @@ # Historical Changelog + +## [object_store_0.7.1](https://github.com/apache/arrow-rs/tree/object_store_0.7.1) (2023-09-26) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.0...object_store_0.7.1) + +**Implemented enhancements:** + +- Automatically Cleanup LocalFileSystem Temporary Files [\#4778](https://github.com/apache/arrow-rs/issues/4778) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: Expose an async reader API for object store [\#4762](https://github.com/apache/arrow-rs/issues/4762) +- Improve proxy support by using reqwest::Proxy as configuration [\#4713](https://github.com/apache/arrow-rs/issues/4713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- object-store: http shouldn't perform range requests unless `accept-ranges: bytes` header is present [\#4839](https://github.com/apache/arrow-rs/issues/4839) +- object-store: http-store fails when url doesn't have last-modified header on 0.7.0 [\#4831](https://github.com/apache/arrow-rs/issues/4831) +- object-store 
fails to compile for `wasm32-unknown-unknown` with `http` feature [\#4776](https://github.com/apache/arrow-rs/issues/4776) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: could not find `header` in `client` for `http` feature [\#4775](https://github.com/apache/arrow-rs/issues/4775) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem Copy and Rename Don't Create Intermediate Directories [\#4760](https://github.com/apache/arrow-rs/issues/4760) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem Copy is not Atomic [\#4758](https://github.com/apache/arrow-rs/issues/4758) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Closed issues:** + +- object\_store Azure Government Cloud functionality? [\#4853](https://github.com/apache/arrow-rs/issues/4853) + +**Merged pull requests:** + +- Add ObjectStore BufReader \(\#4762\) [\#4857](https://github.com/apache/arrow-rs/pull/4857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Allow overriding azure endpoint [\#4854](https://github.com/apache/arrow-rs/pull/4854) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve object\_store docs.rs landing page [\#4849](https://github.com/apache/arrow-rs/pull/4849) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Error if Remote Ignores HTTP Range Header [\#4841](https://github.com/apache/arrow-rs/pull/4841) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) +- Perform HEAD request for HttpStore::head [\#4837](https://github.com/apache/arrow-rs/pull/4837) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix: object store http header last modified [\#4834](https://github.com/apache/arrow-rs/pull/4834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) +- Prepare arrow 47.0.0 [\#4827](https://github.com/apache/arrow-rs/pull/4827) ([tustvold](https://github.com/tustvold)) +- ObjectStore Wasm32 Fixes \(\#4775\) \(\#4776\) [\#4796](https://github.com/apache/arrow-rs/pull/4796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Best effort cleanup of staged upload files \(\#4778\) [\#4792](https://github.com/apache/arrow-rs/pull/4792) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relaxing type bounds on coalesce\_ranges and collect\_bytes [\#4787](https://github.com/apache/arrow-rs/pull/4787) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) +- Update object\_store chrono deprecations [\#4786](https://github.com/apache/arrow-rs/pull/4786) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make coalesce\_ranges and collect\_bytes available for crate users [\#4784](https://github.com/apache/arrow-rs/pull/4784) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) +- Bump actions/checkout from 3 to 4 
[\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Make ObjectStore::copy Atomic and Automatically Create Parent Directories \(\#4758\) \(\#4760\) [\#4759](https://github.com/apache/arrow-rs/pull/4759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update nix requirement from 0.26.1 to 0.27.1 in /object\_store [\#4744](https://github.com/apache/arrow-rs/pull/4744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Add `with_proxy_ca_certificate` and `with_proxy_excludes` [\#4714](https://github.com/apache/arrow-rs/pull/4714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gordonwang0](https://github.com/gordonwang0)) +- Update object\_store Dependencies and Configure Dependabot [\#4700](https://github.com/apache/arrow-rs/pull/4700) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + ## [object_store_0.7.0](https://github.com/apache/arrow-rs/tree/object_store_0.7.0) (2023-08-15) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.1...object_store_0.7.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f069ce..c24cf54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,48 +19,81 @@ # Changelog -## [object_store_0.7.1](https://github.com/apache/arrow-rs/tree/object_store_0.7.1) (2023-09-26) +## [object_store_0.8.0](https://github.com/apache/arrow-rs/tree/object_store_0.8.0) (2023-11-02) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.0...object_store_0.7.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.1...object_store_0.8.0) + +**Breaking changes:** + +- Remove ObjectStore::append [\#5016](https://github.com/apache/arrow-rs/pull/5016) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Don't panic on invalid Azure access key \(\#4972\) [\#4974](https://github.com/apache/arrow-rs/pull/4974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Return `PutResult` with an ETag from ObjectStore::put \(\#4934\) [\#4944](https://github.com/apache/arrow-rs/pull/4944) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectMeta::version and GetOptions::version \(\#4925\) [\#4935](https://github.com/apache/arrow-rs/pull/4935) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add GetOptions::head [\#4931](https://github.com/apache/arrow-rs/pull/4931) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove Nested async and Fallibility from ObjectStore::list [\#4930](https://github.com/apache/arrow-rs/pull/4930) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Automatically Cleanup LocalFileSystem Temporary Files [\#4778](https://github.com/apache/arrow-rs/issues/4778) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object-store: Expose an async reader API for object store [\#4762](https://github.com/apache/arrow-rs/issues/4762) -- Improve proxy support by using reqwest::Proxy 
as configuration [\#4713](https://github.com/apache/arrow-rs/issues/4713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Relax Path Safety on Parse [\#5019](https://github.com/apache/arrow-rs/issues/5019) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore: hard to determine the cause of the error thrown from retry [\#5013](https://github.com/apache/arrow-rs/issues/5013) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- continue existing multi-part upload [\#4961](https://github.com/apache/arrow-rs/issues/4961) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Simplify ObjectStore::List [\#4946](https://github.com/apache/arrow-rs/issues/4946) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Return ETag and Version on Put [\#4934](https://github.com/apache/arrow-rs/issues/4934) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Not Signing Requests in AmazonS3 [\#4927](https://github.com/apache/arrow-rs/issues/4927) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Get Object By Version [\#4925](https://github.com/apache/arrow-rs/issues/4925) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Plans for supporting Extension Array to support Fixed shape tensor Array [\#4890](https://github.com/apache/arrow-rs/issues/4890) +- Conditional Put Support [\#4879](https://github.com/apache/arrow-rs/issues/4879) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- creates\_dir\_if\_not\_present\_append Test is Flaky [\#4872](https://github.com/apache/arrow-rs/issues/4872) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Release object\_store `0.7.1` [\#4858](https://github.com/apache/arrow-rs/issues/4858) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support User-Defined Object Metadata [\#4754](https://github.com/apache/arrow-rs/issues/4754) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- APIs for directly managing multi-part uploads and saving potential parquet footers [\#4608](https://github.com/apache/arrow-rs/issues/4608) **Fixed bugs:** -- object-store: http shouldn't perform range requests unless `accept-ranges: bytes` header is present [\#4839](https://github.com/apache/arrow-rs/issues/4839) -- object-store: http-store fails when url doesn't have last-modified header on 0.7.0 [\#4831](https://github.com/apache/arrow-rs/issues/4831) -- object-store fails to compile for `wasm32-unknown-unknown` with `http` feature [\#4776](https://github.com/apache/arrow-rs/issues/4776) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object-store: could not find `header` in `client` for `http` feature [\#4775](https://github.com/apache/arrow-rs/issues/4775) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- LocalFileSystem Copy and Rename Don't Create Intermediate Directories [\#4760](https://github.com/apache/arrow-rs/issues/4760) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- LocalFileSystem Copy is not Atomic [\#4758](https://github.com/apache/arrow-rs/issues/4758) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore parse\_url Incorrectly Handles URLs with Spaces [\#5017](https://github.com/apache/arrow-rs/issues/5017) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[objects-store\]: periods/dots error in GCP bucket [\#4991](https://github.com/apache/arrow-rs/issues/4991) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure ImdsManagedIdentityProvider does not work in Azure functions [\#4976](https://github.com/apache/arrow-rs/issues/4976) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Panic when using an azure object store with an invalid access key [\#4972](https://github.com/apache/arrow-rs/issues/4972) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Handle Body Errors in AWS CompleteMultipartUpload [\#4965](https://github.com/apache/arrow-rs/issues/4965) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore multiple\_append Test is Flaky [\#4868](https://github.com/apache/arrow-rs/issues/4868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[objectstore\] Problem with special characters in file path [\#4454](https://github.com/apache/arrow-rs/issues/4454) **Closed issues:** -- object\_store Azure Government Cloud functionality? [\#4853](https://github.com/apache/arrow-rs/issues/4853) +- Include onelake fabric path for https [\#5000](https://github.com/apache/arrow-rs/issues/5000) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] Support generating and using signed upload URLs [\#4763](https://github.com/apache/arrow-rs/issues/4763) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Add ObjectStore BufReader \(\#4762\) [\#4857](https://github.com/apache/arrow-rs/pull/4857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Allow overriding azure endpoint [\#4854](https://github.com/apache/arrow-rs/pull/4854) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Minor: Improve object\_store docs.rs landing page [\#4849](https://github.com/apache/arrow-rs/pull/4849) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Error if Remote Ignores HTTP Range Header [\#4841](https://github.com/apache/arrow-rs/pull/4841) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) -- Perform HEAD request for HttpStore::head [\#4837](https://github.com/apache/arrow-rs/pull/4837) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- fix: object store http header last modified [\#4834](https://github.com/apache/arrow-rs/pull/4834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) -- Prepare arrow 47.0.0 [\#4827](https://github.com/apache/arrow-rs/pull/4827) ([tustvold](https://github.com/tustvold)) -- ObjectStore Wasm32 Fixes \(\#4775\) \(\#4776\) [\#4796](https://github.com/apache/arrow-rs/pull/4796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Best effort cleanup of staged upload files \(\#4778\) [\#4792](https://github.com/apache/arrow-rs/pull/4792) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Relaxing 
type bounds on coalesce\_ranges and collect\_bytes [\#4787](https://github.com/apache/arrow-rs/pull/4787) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) -- Update object\_store chrono deprecations [\#4786](https://github.com/apache/arrow-rs/pull/4786) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Make coalesce\_ranges and collect\_bytes available for crate users [\#4784](https://github.com/apache/arrow-rs/pull/4784) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) -- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Make ObjectStore::copy Atomic and Automatically Create Parent Directories \(\#4758\) \(\#4760\) [\#4759](https://github.com/apache/arrow-rs/pull/4759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update nix requirement from 0.26.1 to 0.27.1 in /object\_store [\#4744](https://github.com/apache/arrow-rs/pull/4744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) -- Add `with_proxy_ca_certificate` and `with_proxy_excludes` [\#4714](https://github.com/apache/arrow-rs/pull/4714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gordonwang0](https://github.com/gordonwang0)) -- Update object\_store Dependencies and Configure Dependabot [\#4700](https://github.com/apache/arrow-rs/pull/4700) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relax path safety \(\#5019\) [\#5020](https://github.com/apache/arrow-rs/pull/5020) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Decode URL paths \(\#5017\) [\#5018](https://github.com/apache/arrow-rs/pull/5018) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ObjectStore: make error msg thrown from retry more detailed [\#5012](https://github.com/apache/arrow-rs/pull/5012) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Rachelint](https://github.com/Rachelint)) +- Support onelake fabric paths in parse\_url \(\#5000\) [\#5002](https://github.com/apache/arrow-rs/pull/5002) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object tagging \(\#4754\) [\#4999](https://github.com/apache/arrow-rs/pull/4999) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- \[MINOR\] No need to jump to web pages [\#4994](https://github.com/apache/arrow-rs/pull/4994) ([smallzhongfeng](https://github.com/smallzhongfeng)) +- Pushdown list\_with\_offset for GCS [\#4993](https://github.com/apache/arrow-rs/pull/4993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Support bucket name with `.` when parsing GCS URL \(\#4991\) [\#4992](https://github.com/apache/arrow-rs/pull/4992) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Increase default timeout to 30 seconds [\#4989](https://github.com/apache/arrow-rs/pull/4989) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Conditional Put \(\#4879\) [\#4984](https://github.com/apache/arrow-rs/pull/4984) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.30.0 to 0.31.0 in /object\_store [\#4983](https://github.com/apache/arrow-rs/pull/4983) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump actions/setup-node from 3 to 4 [\#4982](https://github.com/apache/arrow-rs/pull/4982) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support ImdsManagedIdentityProvider in Azure Functions \(\#4976\) [\#4977](https://github.com/apache/arrow-rs/pull/4977) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add MultiPartStore \(\#4961\) \(\#4608\) [\#4971](https://github.com/apache/arrow-rs/pull/4971) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Split gcp Module [\#4956](https://github.com/apache/arrow-rs/pull/4956) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add module links in docs root [\#4955](https://github.com/apache/arrow-rs/pull/4955) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Prepare arrow 48.0.0 [\#4948](https://github.com/apache/arrow-rs/pull/4948) ([tustvold](https://github.com/tustvold)) +- Allow opting out of request signing \(\#4927\) [\#4929](https://github.com/apache/arrow-rs/pull/4929) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Default connection and request timeouts of 5 seconds [\#4928](https://github.com/apache/arrow-rs/pull/4928) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Support service\_account in ApplicationDefaultCredentials and Use SelfSignedJwt [\#4926](https://github.com/apache/arrow-rs/pull/4926) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Generate `ETag`s for `InMemory` and `LocalFileSystem` \(\#4879\) [\#4922](https://github.com/apache/arrow-rs/pull/4922) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Cleanup `object_store::retry` client error handling [\#4915](https://github.com/apache/arrow-rs/pull/4915) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix integration tests [\#4889](https://github.com/apache/arrow-rs/pull/4889) ([tustvold](https://github.com/tustvold)) +- Support Parsing Avro File Headers [\#4888](https://github.com/apache/arrow-rs/pull/4888) ([tustvold](https://github.com/tustvold)) +- Update ring requirement from 0.16 to 0.17 in /object\_store [\#4887](https://github.com/apache/arrow-rs/pull/4887) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add AWS presigned URL support [\#4876](https://github.com/apache/arrow-rs/pull/4876) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([carols10cents](https://github.com/carols10cents)) +- Flush in creates\_dir\_if\_not\_present\_append \(\#4872\) [\#4874](https://github.com/apache/arrow-rs/pull/4874) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Flush in multiple\_append test \(\#4868\) [\#4869](https://github.com/apache/arrow-rs/pull/4869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Enable new integration tests \(\#4828\) [\#4862](https://github.com/apache/arrow-rs/pull/4862) ([tustvold](https://github.com/tustvold)) diff --git a/Cargo.toml b/Cargo.toml index c8cf4e2..7fcb6ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.7.1" +version = "0.8.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index aeec3ca..33eeb33 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.7.0" -FUTURE_RELEASE="object_store_0.7.1" +SINCE_TAG="object_store_0.7.1" +FUTURE_RELEASE="object_store_0.8.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 8582f52996a0642cf053b50dc6d1b49fe9dc6c24 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:37:19 +0000 Subject: [PATCH 231/397] Verify object_store with all features (#5024) --- dev/release/verify-release-candidate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 06a5d8b..b24bd8f 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -103,7 +103,7 @@ test_source_distribution() { # build and test rust cargo build - cargo test --all + cargo test --all --all-features # verify that the crate can be published to crates.io cargo publish --dry-run From 76664beaed0f2fa72002113bdbe1dac30264b88d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:27:25 +0000 Subject: [PATCH 232/397] Fix invalid_path test (#5026) --- src/local.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/local.rs b/src/local.rs index e5c4e32..dd71d9e 100644 --- a/src/local.rs +++ b/src/local.rs @@ -1384,13 +1384,14 @@ mod tests { std::fs::write(emoji, "foo").unwrap(); // Can list illegal file - let paths = flatten_list_stream(&integration, None).await.unwrap(); + let mut paths = flatten_list_stream(&integration, None).await.unwrap(); + paths.sort_unstable(); assert_eq!( paths, vec![ - Path::parse("💀").unwrap(), - Path::parse("directory/child.txt").unwrap() + Path::parse("directory/child.txt").unwrap(), + Path::parse("💀").unwrap() ] ); } From a9c17c6f56418261eeb561abd2449b476de81686 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 8 Nov 2023 16:00:41 +0100 Subject: [PATCH 233/397] refactor: change `object_store` CA handling (#5056) Closes #4870. 
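For downstream crates the switch is transparent: the system-bundled roots are used by default, and the Mozilla root bundle can be opted into with the new `tls-webpki-roots` feature. A minimal sketch of a consumer `Cargo.toml`, assuming a crate that also enables the `aws` store feature; the version requirement is illustrative only:

    [dependencies]
    # illustrative dependency spec: pick the real object_store version you depend on,
    # and combine "tls-webpki-roots" with whichever store features you need
    object_store = { version = "*", features = ["aws", "tls-webpki-roots"] }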
--- Cargo.toml | 3 ++- src/lib.rs | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7fcb6ce..bf83015 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,7 +50,7 @@ quick-xml = { version = "0.31.0", features = ["serialize", "overlapped-lists"], serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls-native-roots"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } @@ -64,6 +64,7 @@ azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] http = ["cloud"] +tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] [dev-dependencies] # In alphabetical order tempfile = "3.1.0" diff --git a/src/lib.rs b/src/lib.rs index cdd572d..f791e65 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -86,6 +86,17 @@ doc = "* [`http`]: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] //! +//! # TLS Certificates +//! +//! Stores that use HTTPS/TLS (this is true for most cloud stores) can choose the source of their [CA] +//! certificates. By default the system-bundled certificates are used (see +//! [`rustls-native-certs`]). The `tls-webpki-roots` feature switch can be used to also bundle Mozilla's +//! root certificates with the library/application (see [`webpki-roots`]). +//! +//! [CA]: https://en.wikipedia.org/wiki/Certificate_authority +//! [`rustls-native-certs`]: https://crates.io/crates/rustls-native-certs/ +//! [`webpki-roots`]: https://crates.io/crates/webpki-roots +//! //! # Why not a Filesystem Interface? //! //! Whilst this crate does provide a [`BufReader`], the [`ObjectStore`] interface mirrors the APIs From cdc0151f99cf263d3c23dd9cfdfbfc0e1401cbcc Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 8 Nov 2023 16:51:13 +0100 Subject: [PATCH 234/397] docs: re-order `object_store` intro (#5058) --- src/lib.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f791e65..2d1d549 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -86,17 +86,6 @@ doc = "* [`http`]: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] //! -//! # TLS Certificates -//! -//! Stores that use HTTPS/TLS (this is true for most cloud stores) can choose the source of their [CA] -//! certificates. By default the system-bundled certificates are used (see -//! [`rustls-native-certs`]). The `tls-webpki-roots` feature switch can be used to also bundle Mozilla's -//! root certificates with the library/application (see [`webpki-roots`]). -//! -//! [CA]: https://en.wikipedia.org/wiki/Certificate_authority -//! [`rustls-native-certs`]: https://crates.io/crates/rustls-native-certs/ -//! [`webpki-roots`]: https://crates.io/crates/webpki-roots -//! //! # Why not a Filesystem Interface? //! //! 
Whilst this crate does provide a [`BufReader`], the [`ObjectStore`] interface mirrors the APIs @@ -447,6 +436,17 @@ //! [Apache Iceberg]: https://iceberg.apache.org/ //! [Delta Lake]: https://delta.io/ //! +//! # TLS Certificates +//! +//! Stores that use HTTPS/TLS (this is true for most cloud stores) can choose the source of their [CA] +//! certificates. By default the system-bundled certificates are used (see +//! [`rustls-native-certs`]). The `tls-webpki-roots` feature switch can be used to also bundle Mozilla's +//! root certificates with the library/application (see [`webpki-roots`]). +//! +//! [CA]: https://en.wikipedia.org/wiki/Certificate_authority +//! [`rustls-native-certs`]: https://crates.io/crates/rustls-native-certs/ +//! [`webpki-roots`]: https://crates.io/crates/webpki-roots +//! #[cfg(all( target_arch = "wasm32", From a5b670df5012d8fe52317374a2d223b4618a8099 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Fri, 10 Nov 2023 10:41:07 -0500 Subject: [PATCH 235/397] Add a PR under "Breaking changes" in the object_store 0.8.0 changelog (#5063) This PR adds a method, `put_opts`, to the `ObjectStore` trait, so any implementer of this trait will need to update their code when they upgrade to 0.8.0. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c24cf54..7a4fcd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ - Add ObjectMeta::version and GetOptions::version \(\#4925\) [\#4935](https://github.com/apache/arrow-rs/pull/4935) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) - Add GetOptions::head [\#4931](https://github.com/apache/arrow-rs/pull/4931) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) - Remove Nested async and Fallibility from ObjectStore::list [\#4930](https://github.com/apache/arrow-rs/pull/4930) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore::put_opts / Conditional Put [\#4879](https://github.com/apache/arrow-rs/pull/4984) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** From f2c6f26fa9e7a8faffca3c591a5d3390d69fc763 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 08:19:07 -0800 Subject: [PATCH 236/397] Update itertools requirement from 0.11.0 to 0.12.0 in /object_store (#5077) Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version. - [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-itertools/itertools/compare/v0.11.0...v0.12.0) --- updated-dependencies: - dependency-name: itertools dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index bf83015..2f5157c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ bytes = "1.0" chrono = { version = "0.4.31", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" -itertools = "0.11.0" +itertools = "0.12.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" From ad8da1e512835c5883d51bcfb023f236579779d1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Nov 2023 14:52:26 +0000 Subject: [PATCH 237/397] Fix latest clippy lints (#5090) --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2d1d549..40dca8f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1535,11 +1535,11 @@ mod tests { let expected: Vec<_> = files .iter() - .cloned() .filter(|x| { let prefix_match = prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); - prefix_match && x > &offset + prefix_match && *x > &offset }) + .cloned() .collect(); assert_eq!(actual, expected, "{prefix:?} - {offset:?}"); From 6f05363b3554ba9fba17ece3aef5daf5d73193ad Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 21 Nov 2023 08:18:21 +1100 Subject: [PATCH 238/397] Allow writing null valued keys in JSON (#5065) * Allow writing null valued keys in JSON * Trigger * Refactor keep nulls to be runtime config * Rename option * Rename option --- src/gcp/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 5f718d6..7417ea4 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -605,7 +605,7 @@ mod tests { .with_bucket_name("foo") .with_proxy_url("https://example.com") .build(); - assert!(dbg!(gcs).is_ok()); + assert!(gcs.is_ok()); let err = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) From 288086077563c5d3557832615b4a807cfdc05771 Mon Sep 17 00:00:00 2001 From: Robin Lin <128118209+RobinLin666@users.noreply.github.com> Date: Mon, 27 Nov 2023 19:07:45 +0800 Subject: [PATCH 239/397] Fix ObjectStore.LocalFileSystem.put_opts for blobfuse (#5094) * Fix ObjectStore.LocalFileSystem.put_opts for blobfuse * Fix ObjectStore.LocalFileSystem.put_opts for blobfuse * fix comment * fix race condition * add comment --- src/local.rs | 56 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/src/local.rs b/src/local.rs index dd71d9e..71b96f0 100644 --- a/src/local.rs +++ b/src/local.rs @@ -338,28 +338,41 @@ impl ObjectStore for LocalFileSystem { maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); + let mut e_tag = None; let err = match file.write_all(&bytes) { - Ok(_) => match opts.mode { - PutMode::Overwrite => match std::fs::rename(&staging_path, &path) { - Ok(_) => None, - Err(source) => Some(Error::UnableToRenameFile { source }), - }, - PutMode::Create => match std::fs::hard_link(&staging_path, &path) { - Ok(_) => { - let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup - None + Ok(_) => { + let metadata = file.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: path.to_string_lossy().to_string(), + })?; + e_tag = 
Some(get_etag(&metadata)); + match opts.mode { + PutMode::Overwrite => { + // For some fuse types of file systems, the file must be closed first + // to trigger the upload operation, and then renamed, such as Blobfuse + std::mem::drop(file); + match std::fs::rename(&staging_path, &path) { + Ok(_) => None, + Err(source) => Some(Error::UnableToRenameFile { source }), + } } - Err(source) => match source.kind() { - ErrorKind::AlreadyExists => Some(Error::AlreadyExists { - path: path.to_str().unwrap().to_string(), - source, - }), - _ => Some(Error::UnableToRenameFile { source }), + PutMode::Create => match std::fs::hard_link(&staging_path, &path) { + Ok(_) => { + let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup + None + } + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => Some(Error::AlreadyExists { + path: path.to_str().unwrap().to_string(), + source, + }), + _ => Some(Error::UnableToRenameFile { source }), + }, }, - }, - PutMode::Update(_) => unreachable!(), - }, + PutMode::Update(_) => unreachable!(), + } + } Err(source) => Some(Error::UnableToCopyDataToFile { source }), }; @@ -368,13 +381,8 @@ impl ObjectStore for LocalFileSystem { return Err(err.into()); } - let metadata = file.metadata().map_err(|e| Error::Metadata { - source: e.into(), - path: path.to_string_lossy().to_string(), - })?; - Ok(PutResult { - e_tag: Some(get_etag(&metadata)), + e_tag, version: None, }) }) From 33c220f914385cdc8e6c457411fb8cc7f4b224c0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 28 Nov 2023 15:38:00 +0000 Subject: [PATCH 240/397] Update localstack to 3.0.1 (#5028) --- src/aws/mod.rs | 8 +++----- src/azure/mod.rs | 2 +- src/http/mod.rs | 2 +- src/lib.rs | 18 +++++++----------- 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index cbb3cff..0985263 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -336,12 +336,10 @@ mod tests { let integration = config.build().unwrap(); let config = integration.client.config(); - let is_local = config.endpoint.starts_with("http://"); let test_not_exists = config.copy_if_not_exists.is_some(); let test_conditional_put = config.conditional_put.is_some(); - // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 - put_get_delete_list_opts(&integration, is_local).await; + put_get_delete_list_opts(&integration).await; get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; @@ -364,12 +362,12 @@ mod tests { // run integration test with unsigned payload enabled let builder = AmazonS3Builder::from_env().with_unsigned_payload(true); let integration = builder.build().unwrap(); - put_get_delete_list_opts(&integration, is_local).await; + put_get_delete_list_opts(&integration).await; // run integration test with checksum set to sha256 let builder = AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); let integration = builder.build().unwrap(); - put_get_delete_list_opts(&integration, is_local).await; + put_get_delete_list_opts(&integration).await; } #[tokio::test] diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 1d51cbd..af0a4ce 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -193,7 +193,7 @@ mod tests { crate::test_util::maybe_skip_integration!(); let integration = MicrosoftAzureBuilder::from_env().build().unwrap(); - put_get_delete_list_opts(&integration, false).await; + 
put_get_delete_list_opts(&integration).await; get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; diff --git a/src/http/mod.rs b/src/http/mod.rs index cfcde27..f1d11db 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -264,7 +264,7 @@ mod tests { .build() .unwrap(); - put_get_delete_list_opts(&integration, false).await; + put_get_delete_list_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/src/lib.rs b/src/lib.rs index 40dca8f..5c5c70d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1236,13 +1236,10 @@ mod tests { use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { - put_get_delete_list_opts(storage, false).await + put_get_delete_list_opts(storage).await } - pub(crate) async fn put_get_delete_list_opts( - storage: &DynObjectStore, - skip_list_with_spaces: bool, - ) { + pub(crate) async fn put_get_delete_list_opts(storage: &DynObjectStore) { delete_fixtures(storage).await; let content_list = flatten_list_stream(storage, None).await.unwrap(); @@ -1483,12 +1480,11 @@ mod tests { storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); storage.head(&path).await.unwrap(); - if !skip_list_with_spaces { - let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) - .await - .unwrap(); - assert_eq!(files, vec![path.clone()]); - } + let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + storage.delete(&path).await.unwrap(); let files = flatten_list_stream(storage, None).await.unwrap(); From f04eb5cb79da63cbcfbd3afd5483db3270b02bcd Mon Sep 17 00:00:00 2001 From: emcake <3726783+emcake@users.noreply.github.com> Date: Wed, 29 Nov 2023 17:42:10 +0000 Subject: [PATCH 241/397] Allow 403 for overwrite prevention (#5134) * Allow 403 for overwrite prevention * implment instead via a new 'return code override' key * add with_... 
method * rework: implement via header-with-status * Update object_store/src/aws/precondition.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/aws/precondition.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/aws/precondition.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/aws/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * review comments * clipps lints & docs --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/aws/client.rs | 12 ++++- src/aws/precondition.rs | 98 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 3e47abd..ecbe556 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -45,7 +45,7 @@ use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; use reqwest::{ header::{CONTENT_LENGTH, CONTENT_TYPE}, - Client as ReqwestClient, Method, RequestBuilder, Response, StatusCode, + Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; @@ -466,6 +466,9 @@ impl S3Client { Some(S3CopyIfNotExists::Header(k, v)) => { builder = builder.header(k, v); } + Some(S3CopyIfNotExists::HeaderWithStatus(k, v, _)) => { + builder = builder.header(k, v); + } None => { return Err(crate::Error::NotSupported { source: "S3 does not support copy-if-not-exists".to_string().into(), @@ -474,6 +477,11 @@ impl S3Client { } } + let precondition_failure = match &self.config.copy_if_not_exists { + Some(S3CopyIfNotExists::HeaderWithStatus(_, _, code)) => *code, + _ => reqwest::StatusCode::PRECONDITION_FAILED, + }; + builder .with_aws_sigv4( credential.as_deref(), @@ -485,7 +493,7 @@ impl S3Client { .send_retry(&self.config.retry_config) .await .map_err(|source| match source.status() { - Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { + Some(error) if error == precondition_failure => crate::Error::AlreadyExists { source: Box::new(source), path: to.to_string(), }, diff --git a/src/aws/precondition.rs b/src/aws/precondition.rs index a50b57f..ada5f3b 100644 --- a/src/aws/precondition.rs +++ b/src/aws/precondition.rs @@ -17,11 +17,13 @@ use crate::config::Parse; +use itertools::Itertools; + /// Configure how to provide [`ObjectStore::copy_if_not_exists`] for [`AmazonS3`]. /// /// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists /// [`AmazonS3`]: super::AmazonS3 -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] #[non_exhaustive] pub enum S3CopyIfNotExists { /// Some S3-compatible stores, such as Cloudflare R2, support copy if not exists @@ -29,7 +31,7 @@ pub enum S3CopyIfNotExists { /// /// If set, [`ObjectStore::copy_if_not_exists`] will perform a normal copy operation /// with the provided header pair, and expect the store to fail with `412 Precondition Failed` - /// if the destination file already exists + /// if the destination file already exists. 
/// /// Encoded as `header::` ignoring whitespace /// @@ -38,12 +40,20 @@ pub enum S3CopyIfNotExists { /// /// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists Header(String, String), + /// The same as [`S3CopyIfNotExists::Header`] but allows custom status code checking, for object stores that return values + /// other than 412. + /// + /// Encoded as `header-with-status:::` ignoring whitespace + HeaderWithStatus(String, String, reqwest::StatusCode), } impl std::fmt::Display for S3CopyIfNotExists { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Header(k, v) => write!(f, "header: {}: {}", k, v), + Self::HeaderWithStatus(k, v, code) => { + write!(f, "header-with-status: {k}: {v}: {}", code.as_u16()) + } } } } @@ -56,6 +66,17 @@ impl S3CopyIfNotExists { let (k, v) = value.split_once(':')?; Some(Self::Header(k.trim().to_string(), v.trim().to_string())) } + "header-with-status" => { + let (k, v, status) = value.split(':').collect_tuple()?; + + let code = status.trim().parse().ok()?; + + Some(Self::HeaderWithStatus( + k.trim().to_string(), + v.trim().to_string(), + code, + )) + } _ => None, } } @@ -111,3 +132,76 @@ impl Parse for S3ConditionalPut { }) } } + +#[cfg(test)] +mod tests { + use super::S3CopyIfNotExists; + + #[test] + fn parse_s3_copy_if_not_exists_header() { + let input = "header: cf-copy-destination-if-none-match: *"; + let expected = Some(S3CopyIfNotExists::Header( + "cf-copy-destination-if-none-match".to_owned(), + "*".to_owned(), + )); + + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + + #[test] + fn parse_s3_copy_if_not_exists_header_with_status() { + let input = "header-with-status:key:value:403"; + let expected = Some(S3CopyIfNotExists::HeaderWithStatus( + "key".to_owned(), + "value".to_owned(), + reqwest::StatusCode::FORBIDDEN, + )); + + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + + #[test] + fn parse_s3_copy_if_not_exists_header_whitespace_invariant() { + let expected = Some(S3CopyIfNotExists::Header( + "cf-copy-destination-if-none-match".to_owned(), + "*".to_owned(), + )); + + const INPUTS: &[&str] = &[ + "header:cf-copy-destination-if-none-match:*", + "header: cf-copy-destination-if-none-match:*", + "header: cf-copy-destination-if-none-match: *", + "header : cf-copy-destination-if-none-match: *", + "header : cf-copy-destination-if-none-match : *", + "header : cf-copy-destination-if-none-match : * ", + ]; + + for input in INPUTS { + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + } + + #[test] + fn parse_s3_copy_if_not_exists_header_with_status_whitespace_invariant() { + let expected = Some(S3CopyIfNotExists::HeaderWithStatus( + "key".to_owned(), + "value".to_owned(), + reqwest::StatusCode::FORBIDDEN, + )); + + const INPUTS: &[&str] = &[ + "header-with-status:key:value:403", + "header-with-status: key:value:403", + "header-with-status: key: value:403", + "header-with-status: key: value: 403", + "header-with-status : key: value: 403", + "header-with-status : key : value: 403", + "header-with-status : key : value : 403", + "header-with-status : key : value : 403 ", + ]; + + for input in INPUTS { + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + } +} From baaf13bf96cc85f3f3088e817ca5509ea02423a7 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 5 Dec 2023 21:03:26 +1100 Subject: [PATCH 242/397] object_store: fix failing doctest with default features (#5161) --- Cargo.toml | 4 ++-- src/lib.rs | 2 ++ 
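// Editorial sketch (not part of the patch): one way the `copy_if_not_exists`
// encodings exercised by the tests above might be wired up programmatically
// instead of via a config string. The bucket name and header pair are
// placeholders, `with_copy_if_not_exists` is assumed to be the existing
// builder hook for this option, and a compatible `reqwest` dependency is
// assumed for the status code type.
use object_store::aws::{AmazonS3, AmazonS3Builder, S3CopyIfNotExists};

fn store_with_custom_copy_precondition() -> object_store::Result<AmazonS3> {
    AmazonS3Builder::from_env()
        .with_bucket_name("my-bucket")
        .with_copy_if_not_exists(S3CopyIfNotExists::HeaderWithStatus(
            "cf-copy-destination-if-none-match".to_string(),
            "*".to_string(),
            // Stores that reject the copy with 403 instead of 412 can now be supported
            reqwest::StatusCode::FORBIDDEN,
        ))
        .build()
}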
2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2f5157c..d5cf91c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -67,10 +67,10 @@ http = ["cloud"] tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] [dev-dependencies] # In alphabetical order -tempfile = "3.1.0" futures-test = "0.3" -rand = "0.8" hyper = { version = "0.14.24", features = ["server"] } +rand = "0.8" +tempfile = "3.1.0" [[test]] name = "get_range_file" diff --git a/src/lib.rs b/src/lib.rs index 5c5c70d..3a84166 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -119,6 +119,7 @@ //! application complexity. //! //! ```no_run +//! # #[cfg(feature = "aws")] { //! # use url::Url; //! # use object_store::{parse_url, parse_url_opts}; //! # use object_store::aws::{AmazonS3, AmazonS3Builder}; @@ -140,6 +141,7 @@ //! let url = Url::parse("https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket/path").unwrap(); //! let (store, path) = parse_url(&url).unwrap(); //! assert_eq!(path.as_ref(), "path"); +//! # } //! ``` //! //! [PyArrow FileSystem]: https://arrow.apache.org/docs/python/generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem.from_uri From 6c99069749e55330f2c508a58a9536e22cb9b76d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 5 Dec 2023 12:54:05 +0000 Subject: [PATCH 243/397] Update rustls-pemfile requirement from 1.0 to 2.0 in /object_store (#5155) * Update rustls-pemfile requirement from 1.0 to 2.0 in /object_store Updates the requirements on [rustls-pemfile](https://github.com/rustls/pemfile) to permit the latest version. - [Release notes](https://github.com/rustls/pemfile/releases) - [Commits](https://github.com/rustls/pemfile/compare/v/1.0.0...v/2.0.0) --- updated-dependencies: - dependency-name: rustls-pemfile dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Update --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- Cargo.toml | 2 +- src/gcp/credential.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d5cf91c..e7f99e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,7 +52,7 @@ serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } reqwest = { version = "0.11", default-features = false, features = ["rustls-tls-native-roots"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } -rustls-pemfile = { version = "1.0", default-features = false, optional = true } +rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } [target.'cfg(target_family="unix")'.dev-dependencies] diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 29c7b45..dc504da 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -304,8 +304,8 @@ fn decode_first_rsa_key(private_key_pem: String) -> Result { // Reading from string is infallible match rustls_pemfile::read_one(&mut reader).unwrap() { - Some(Item::PKCS8Key(key)) => Ok(RsaKeyPair::from_pkcs8(&key)?), - Some(Item::RSAKey(key)) => Ok(RsaKeyPair::from_der(&key)?), + Some(Item::Pkcs8Key(key)) => Ok(RsaKeyPair::from_pkcs8(key.secret_pkcs8_der())?), + Some(Item::Pkcs1Key(key)) => Ok(RsaKeyPair::from_der(key.secret_pkcs1_der())?), _ => Err(Error::MissingKey), } } From 9c14bdc77f11ad7f7c31c30540bfb9d4b7b9bf08 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 12 Dec 2023 09:11:23 -0800 Subject: [PATCH 244/397] feat(object_store): use http1 by default (#5204) * feat: use http1 by default * add note to GCS docs * fix docs * simplify changes * bring back option --- src/client/mod.rs | 16 +++++++++++++++- src/gcp/mod.rs | 7 +++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/client/mod.rs b/src/client/mod.rs index ae092ed..2baf586 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -213,7 +213,10 @@ impl Default for ClientOptions { http2_keep_alive_interval: None, http2_keep_alive_timeout: None, http2_keep_alive_while_idle: Default::default(), - http1_only: Default::default(), + // HTTP2 is known to be significantly slower than HTTP1, so we default + // to HTTP1 for now. + // https://github.com/apache/arrow-rs/issues/5194 + http1_only: true.into(), http2_only: Default::default(), } } @@ -350,17 +353,28 @@ impl ClientOptions { } /// Only use http1 connections + /// + /// This is on by default, since http2 is known to be significantly slower than http1. pub fn with_http1_only(mut self) -> Self { + self.http2_only = false.into(); self.http1_only = true.into(); self } /// Only use http2 connections pub fn with_http2_only(mut self) -> Self { + self.http1_only = false.into(); self.http2_only = true.into(); self } + /// Use http2 if supported, otherwise use http1. 
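// Editorial sketch (not part of the patch): with HTTP/1 now the default, a
// caller could opt back into HTTP/2 negotiation through the `with_allow_http2`
// method introduced just below. The bucket name is a placeholder and
// `with_client_options` is assumed to be the usual builder hook.
use object_store::gcp::{GoogleCloudStorage, GoogleCloudStorageBuilder};
use object_store::ClientOptions;

fn gcs_with_http2() -> object_store::Result<GoogleCloudStorage> {
    GoogleCloudStorageBuilder::from_env()
        .with_bucket_name("my-bucket")
        // Allow the client to negotiate HTTP/2 instead of forcing HTTP/1
        .with_client_options(ClientOptions::new().with_allow_http2())
        .build()
}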
+ pub fn with_allow_http2(mut self) -> Self { + self.http1_only = false.into(); + self.http2_only = false.into(); + self + } + /// Set a proxy URL to use for requests pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { self.proxy_url = Some(proxy_url.into()); diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 11fa683..8633abb 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -29,6 +29,13 @@ //! to abort the upload and drop those unneeded parts. In addition, you may wish to //! consider implementing automatic clean up of unused parts that are older than one //! week. +//! +//! ## Using HTTP/2 +//! +//! Google Cloud Storage supports both HTTP/2 and HTTP/1. HTTP/1 is used by default +//! because it allows much higher throughput in our benchmarks (see +//! [#5194](https://github.com/apache/arrow-rs/issues/5194)). HTTP/2 can be +//! enabled by setting [crate::ClientConfigKey::Http1Only] to false. use std::sync::Arc; use crate::client::CredentialProvider; From 0c4306d3e700712ac65bcbb0511a9d05bf260110 Mon Sep 17 00:00:00 2001 From: Justin Abrahms Date: Sat, 23 Dec 2023 05:26:51 -0800 Subject: [PATCH 245/397] Document default value of InstanceCredentialProvider (#5188) * Document default value of InstanceCredentialProvider * Move docs to credentials field rather than crate-private method * Review feedback * Tweak doctest --------- Co-authored-by: Raphael Taylor-Davies --- src/gcp/builder.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 7417ea4..21b767d 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -78,18 +78,16 @@ impl From for crate::Error { } } -/// Configure a connection to Google Cloud Storage using the specified -/// credentials. +/// Configure a connection to Google Cloud Storage. +/// +/// If no credentials are explicitly provided, they will be sourced +/// from the environment as documented [here](https://cloud.google.com/docs/authentication/application-default-credentials). /// /// # Example /// ``` /// # let BUCKET_NAME = "foo"; -/// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json"; /// # use object_store::gcp::GoogleCloudStorageBuilder; -/// let gcs = GoogleCloudStorageBuilder::new() -/// .with_service_account_path(SERVICE_ACCOUNT_PATH) -/// .with_bucket_name(BUCKET_NAME) -/// .build(); +/// let gcs = GoogleCloudStorageBuilder::from_env().with_bucket_name(BUCKET_NAME).build(); /// ``` #[derive(Debug, Clone)] pub struct GoogleCloudStorageBuilder { From 3c4e95b3ddc4e7affa71041477ee9333c3684ee1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 23 Dec 2023 17:03:40 +0000 Subject: [PATCH 246/397] Remove deprecated try_with_option methods (#5237) --- src/aws/builder.rs | 23 ----------------------- src/azure/builder.rs | 19 ------------------- src/gcp/builder.rs | 19 ------------------- 3 files changed, 61 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index cf9490d..596ff99 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -477,29 +477,6 @@ impl AmazonS3Builder { self } - /// Set an option on the builder via a key - value pair. - /// - /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. 
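// Editorial sketch (not part of the patch): the `with_config` path that
// replaces the removed `try_with_option` / `try_with_options` helpers. Keys
// are parsed into `AmazonS3ConfigKey`, surfacing the same unknown-key error
// the deprecated methods used to return; the key/value pairs passed in are
// caller-supplied placeholders.
use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};

fn builder_from_options<'a>(
    options: impl IntoIterator<Item = (&'a str, &'a str)>,
) -> object_store::Result<AmazonS3Builder> {
    let mut builder = AmazonS3Builder::new();
    for (key, value) in options {
        // `parse` fails with an unknown-configuration-key error for bad keys
        builder = builder.with_config(key.parse::<AmazonS3ConfigKey>()?, value);
    }
    Ok(builder)
}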
- #[deprecated(note = "Use with_config")] - pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - /// - /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options, impl Into)>>( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - /// Get config value via a [`AmazonS3ConfigKey`]. /// /// # Example diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 6bd2b26..2de0a7c 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -521,25 +521,6 @@ impl MicrosoftAzureBuilder { self } - /// Set an option on the builder via a key - value pair. - #[deprecated(note = "Use with_config")] - pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options, impl Into)>>( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - /// Get config value via a [`AzureConfigKey`]. /// /// # Example diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 21b767d..14c4257 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -287,25 +287,6 @@ impl GoogleCloudStorageBuilder { self } - /// Set an option on the builder via a key - value pair. - #[deprecated(note = "Use with_config")] - pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options, impl Into)>>( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - /// Get config value via a [`GoogleConfigKey`]. /// /// # Example From 63a2d4fe04c3edfba24712f203fc5a09f62c9ac5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 26 Dec 2023 13:06:13 +0000 Subject: [PATCH 247/397] Implement `copy_if_not_exist` for `AmazonS3` using DynamoDB (#4880) (#4918) * Implement DynamoDBLock (#4880) * Cleanup error handling * Clippy * Localstack support * Clippy * Handle integration test concurrency * More docs * Disable request timeout * Fix merge conflicts * Reduce test concurrency * Increase timeouts --- src/aws/builder.rs | 24 +- src/aws/client.rs | 179 +++++-------- src/aws/dynamo.rs | 567 ++++++++++++++++++++++++++++++++++++++++ src/aws/mod.rs | 59 ++++- src/aws/precondition.rs | 17 ++ 5 files changed, 700 insertions(+), 146 deletions(-) create mode 100644 src/aws/dynamo.rs diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 596ff99..5f7f1c9 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -821,27 +821,23 @@ impl AmazonS3Builder { )) as _ }; - let endpoint: String; - let bucket_endpoint: String; - // If `endpoint` is provided then its assumed to be consistent with // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then // `endpoint` should have bucket name included. - if self.virtual_hosted_style_request.get()? 
{ - endpoint = self - .endpoint - .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); - bucket_endpoint = endpoint.clone(); + let bucket_endpoint = if self.virtual_hosted_style_request.get()? { + self.endpoint + .clone() + .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")) } else { - endpoint = self - .endpoint - .unwrap_or_else(|| format!("https://s3.{region}.amazonaws.com")); - bucket_endpoint = format!("{endpoint}/{bucket}"); - } + match &self.endpoint { + None => format!("https://s3.{region}.amazonaws.com/{bucket}"), + Some(endpoint) => format!("{endpoint}/{bucket}"), + } + }; let config = S3Config { region, - endpoint, + endpoint: self.endpoint, bucket, bucket_endpoint, credentials, diff --git a/src/aws/client.rs b/src/aws/client.rs index ecbe556..45d97ea 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -21,7 +21,7 @@ use crate::aws::{ AwsCredentialProvider, S3ConditionalPut, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, }; use crate::client::get::GetClient; -use crate::client::header::HeaderConfig; +use crate::client::header::{get_etag, HeaderConfig}; use crate::client::header::{get_put_result, get_version}; use crate::client::list::ListClient; use crate::client::retry::RetryExt; @@ -39,6 +39,7 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; +use hyper::http; use hyper::http::HeaderName; use itertools::Itertools; use percent_encoding::{utf8_percent_encode, PercentEncode}; @@ -57,30 +58,12 @@ const VERSION_HEADER: &str = "x-amz-version-id"; #[derive(Debug, Snafu)] #[allow(missing_docs)] pub(crate) enum Error { - #[snafu(display("Error performing get request {}: {}", path, source))] - GetRequest { - source: crate::client::retry::Error, - path: String, - }, - #[snafu(display("Error fetching get response body {}: {}", path, source))] GetResponseBody { source: reqwest::Error, path: String, }, - #[snafu(display("Error performing put request {}: {}", path, source))] - PutRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error performing delete request {}: {}", path, source))] - DeleteRequest { - source: crate::client::retry::Error, - path: String, - }, - #[snafu(display("Error performing DeleteObjects request: {}", source))] DeleteObjectsRequest { source: crate::client::retry::Error }, @@ -104,12 +87,6 @@ pub(crate) enum Error { source: Box, }, - #[snafu(display("Error performing copy request {}: {}", path, source))] - CopyRequest { - source: crate::client::retry::Error, - path: String, - }, - #[snafu(display("Error performing list request: {}", source))] ListRequest { source: crate::client::retry::Error }, @@ -142,15 +119,9 @@ pub(crate) enum Error { impl From for crate::Error { fn from(err: Error) -> Self { - match err { - Error::GetRequest { source, path } - | Error::DeleteRequest { source, path } - | Error::CopyRequest { source, path } - | Error::PutRequest { source, path } => source.error(STORE, path), - _ => Self::Generic { - store: STORE, - source: Box::new(err), - }, + Self::Generic { + store: STORE, + source: Box::new(err), } } } @@ -196,7 +167,7 @@ impl From for Error { #[derive(Debug)] pub struct S3Config { pub region: String, - pub endpoint: String, + pub endpoint: Option, pub bucket: String, pub bucket_endpoint: String, pub credentials: AwsCredentialProvider, @@ -215,7 +186,7 @@ impl S3Config { format!("{}/{}", self.bucket_endpoint, encode_path(path)) } - async fn get_credential(&self) -> Result>> { + pub(crate) 
async fn get_credential(&self) -> Result>> { Ok(match self.skip_signature { false => Some(self.credentials.get_credential().await?), true => None, @@ -223,30 +194,52 @@ impl S3Config { } } -/// A builder for a put request allowing customisation of the headers and query string -pub(crate) struct PutRequest<'a> { +#[derive(Debug, Snafu)] +pub enum RequestError { + #[snafu(context(false))] + Generic { source: crate::Error }, + Retry { + source: crate::client::retry::Error, + path: String, + }, +} + +impl From for crate::Error { + fn from(value: RequestError) -> Self { + match value { + RequestError::Generic { source } => source, + RequestError::Retry { source, path } => source.error(STORE, path), + } + } +} + +/// A builder for a request allowing customisation of the headers and query string +pub(crate) struct Request<'a> { path: &'a Path, config: &'a S3Config, builder: RequestBuilder, payload_sha256: Option>, } -impl<'a> PutRequest<'a> { +impl<'a> Request<'a> { pub fn query(self, query: &T) -> Self { let builder = self.builder.query(query); Self { builder, ..self } } - pub fn header(self, k: &HeaderName, v: &str) -> Self { + pub fn header(self, k: K, v: &str) -> Self + where + HeaderName: TryFrom, + >::Error: Into, + { let builder = self.builder.header(k, v); Self { builder, ..self } } - pub async fn send(self) -> Result { + pub async fn send(self) -> Result { let credential = self.config.get_credential().await?; - - let response = self - .builder + let path = self.path.as_ref(); + self.builder .with_aws_sigv4( credential.as_deref(), &self.config.region, @@ -256,18 +249,19 @@ impl<'a> PutRequest<'a> { ) .send_retry(&self.config.retry_config) .await - .context(PutRequestSnafu { - path: self.path.as_ref(), - })?; + .context(RetrySnafu { path }) + } + pub async fn do_put(self) -> Result { + let response = self.send().await?; Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) 
} } #[derive(Debug)] pub(crate) struct S3Client { - config: S3Config, - client: ReqwestClient, + pub config: S3Config, + pub client: ReqwestClient, } impl S3Client { @@ -276,20 +270,15 @@ impl S3Client { Ok(Self { config, client }) } - /// Returns the config - pub fn config(&self) -> &S3Config { - &self.config - } - /// Make an S3 PUT request /// /// Returns the ETag - pub fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> PutRequest<'a> { + pub fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> Request<'a> { let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); let mut payload_sha256 = None; - if let Some(checksum) = self.config().checksum { + if let Some(checksum) = self.config.checksum { let digest = checksum.digest(&bytes); builder = builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); if checksum == Checksum::SHA256 { @@ -302,11 +291,11 @@ impl S3Client { false => builder.body(bytes), }; - if let Some(value) = self.config().client_options.get_content_type(path) { + if let Some(value) = self.config.client_options.get_content_type(path) { builder = builder.header(CONTENT_TYPE, value); } - PutRequest { + Request { path, builder, payload_sha256, @@ -335,9 +324,7 @@ impl S3Client { ) .send_retry(&self.config.retry_config) .await - .context(DeleteRequestSnafu { - path: path.as_ref(), - })?; + .map_err(|e| e.error(STORE, path.to_string()))?; Ok(()) } @@ -400,7 +387,7 @@ impl S3Client { // Compute checksum - S3 *requires* this for DeleteObjects requests, so we default to // their algorithm if the user hasn't specified one. - let checksum = self.config().checksum.unwrap_or(Checksum::SHA256); + let checksum = self.config.checksum.unwrap_or(Checksum::SHA256); let digest = checksum.digest(&body); builder = builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); let payload_sha256 = if checksum == Checksum::SHA256 { @@ -451,60 +438,21 @@ impl S3Client { } /// Make an S3 Copy request - pub async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { - let credential = self.config.get_credential().await?; + pub fn copy_request<'a>(&'a self, from: &'a Path, to: &Path) -> Request<'a> { let url = self.config.path_url(to); let source = format!("{}/{}", self.config.bucket, encode_path(from)); - let mut builder = self + let builder = self .client .request(Method::PUT, url) .header("x-amz-copy-source", source); - if !overwrite { - match &self.config.copy_if_not_exists { - Some(S3CopyIfNotExists::Header(k, v)) => { - builder = builder.header(k, v); - } - Some(S3CopyIfNotExists::HeaderWithStatus(k, v, _)) => { - builder = builder.header(k, v); - } - None => { - return Err(crate::Error::NotSupported { - source: "S3 does not support copy-if-not-exists".to_string().into(), - }) - } - } + Request { + builder, + path: from, + config: &self.config, + payload_sha256: None, } - - let precondition_failure = match &self.config.copy_if_not_exists { - Some(S3CopyIfNotExists::HeaderWithStatus(_, _, code)) => *code, - _ => reqwest::StatusCode::PRECONDITION_FAILED, - }; - - builder - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) - .send_retry(&self.config.retry_config) - .await - .map_err(|source| match source.status() { - Some(error) if error == precondition_failure => crate::Error::AlreadyExists { - source: Box::new(source), - path: to.to_string(), - }, - _ => Error::CopyRequest { - source, - path: from.to_string(), - } - .into(), - 
})?; - - Ok(()) } pub async fn create_multipart(&self, location: &Path) -> Result { @@ -543,15 +491,14 @@ impl S3Client { ) -> Result { let part = (part_idx + 1).to_string(); - let result = self + let response = self .put_request(path, data) .query(&[("partNumber", &part), ("uploadId", upload_id)]) .send() .await?; - Ok(PartId { - content_id: result.e_tag.unwrap(), - }) + let content_id = get_etag(response.headers()).context(MetadataSnafu)?; + Ok(PartId { content_id }) } pub async fn complete_multipart( @@ -614,9 +561,7 @@ impl S3Client { ) .send_retry(&self.config.retry_config) .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; + .map_err(|e| e.error(STORE, path.to_string()))?; Ok(response) } } @@ -657,9 +602,7 @@ impl GetClient for S3Client { ) .send_retry(&self.config.retry_config) .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; + .map_err(|e| e.error(STORE, path.to_string()))?; Ok(response) } diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs new file mode 100644 index 0000000..ce1500b --- /dev/null +++ b/src/aws/dynamo.rs @@ -0,0 +1,567 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A DynamoDB based lock system + +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use chrono::Utc; +use reqwest::{Response, StatusCode}; +use serde::ser::SerializeMap; +use serde::{Deserialize, Serialize, Serializer}; + +use crate::aws::client::S3Client; +use crate::aws::credential::CredentialExt; +use crate::aws::AwsCredential; +use crate::client::get::GetClientExt; +use crate::client::retry::Error as RetryError; +use crate::client::retry::RetryExt; +use crate::path::Path; +use crate::{Error, GetOptions, Result}; + +/// The exception returned by DynamoDB on conflict +const CONFLICT: &str = "ConditionalCheckFailedException"; + +const STORE: &str = "DynamoDB"; + +/// A DynamoDB-based commit protocol, used to provide conditional write support for S3 +/// +/// ## Limitations +/// +/// Only conditional operations, e.g. `copy_if_not_exists` will be synchronized, and can +/// therefore race with non-conditional operations, e.g. `put`, `copy`, `delete`, or +/// conditional operations performed by writers not configured to synchronize with DynamoDB. 
+/// +/// Workloads making use of this mechanism **must** ensure: +/// +/// * Conditional and non-conditional operations are not performed on the same paths +/// * Conditional operations are only performed via similarly configured clients +/// +/// Additionally as the locking mechanism relies on timeouts to detect stale locks, +/// performance will be poor for systems that frequently delete and then create +/// objects at the same path, instead being optimised for systems that primarily create +/// files with paths never used before, or perform conditional updates to existing files +/// +/// ## Commit Protocol +/// +/// The DynamoDB schema is as follows: +/// +/// * A string hash key named `"key"` +/// * A numeric [TTL] attribute named `"ttl"` +/// * A numeric attribute named `"generation"` +/// * A numeric attribute named `"timeout"` +/// +/// To perform a conditional operation on an object with a given `path` and `etag` (if exists), +/// the commit protocol is as follows: +/// +/// 1. Perform HEAD request on `path` and error on precondition mismatch +/// 2. Create record in DynamoDB with key `{path}#{etag}` with the configured timeout +/// 1. On Success: Perform operation with the configured timeout +/// 2. On Conflict: +/// 1. Periodically re-perform HEAD request on `path` and error on precondition mismatch +/// 2. If `timeout * max_skew_rate` passed, replace the record incrementing the `"generation"` +/// 1. On Success: GOTO 2.1 +/// 2. On Conflict: GOTO 2.2 +/// +/// Provided no writer modifies an object with a given `path` and `etag` without first adding a +/// corresponding record to DynamoDB, we are guaranteed that only one writer will ever commit. +/// +/// This is inspired by the [DynamoDB Lock Client] but simplified for the more limited +/// requirements of synchronizing object storage. The major changes are: +/// +/// * Uses a monotonic generation count instead of a UUID rvn, as this is: +/// * Cheaper to generate, serialize and compare +/// * Cannot collide +/// * More human readable / interpretable +/// * Relies on [TTL] to eventually clean up old locks +/// +/// It also draws inspiration from the DeltaLake [S3 Multi-Cluster] commit protocol, but +/// generalised to not make assumptions about the workload and not rely on first writing +/// to a temporary path. 
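// Editorial sketch (not part of the patch): enabling the DynamoDB-based
// coordination described above. The bucket and table names are placeholders,
// `with_copy_if_not_exists` is assumed to be the existing builder hook, and
// the DynamoDB table must already exist with the schema documented here.
use object_store::aws::{AmazonS3, AmazonS3Builder, DynamoCommit, S3CopyIfNotExists};

fn coordinated_store() -> object_store::Result<AmazonS3> {
    AmazonS3Builder::from_env()
        .with_bucket_name("my-bucket")
        .with_copy_if_not_exists(S3CopyIfNotExists::Dynamo(
            // 20 second lease timeout, matching the documented default
            DynamoCommit::new("my-lock-table".to_string()).with_timeout(20_000),
        ))
        .build()
}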
+/// +/// [TTL]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/howitworks-ttl.html +/// [DynamoDB Lock Client]: https://aws.amazon.com/blogs/database/building-distributed-locks-with-the-dynamodb-lock-client/ +/// [S3 Multi-Cluster]: https://docs.google.com/document/d/1Gs4ZsTH19lMxth4BSdwlWjUNR-XhKHicDvBjd2RqNd8/edit#heading=h.mjjuxw9mcz9h +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct DynamoCommit { + table_name: String, + /// The number of milliseconds a lease is valid for + timeout: u64, + /// The maximum clock skew rate tolerated by the system + max_clock_skew_rate: u32, + /// The length of time a record will be retained in DynamoDB before being cleaned up + /// + /// This is purely an optimisation to avoid indefinite growth of the DynamoDB table + /// and does not impact how long clients may wait to acquire a lock + ttl: Duration, + /// The backoff duration before retesting a condition + test_interval: Duration, +} + +impl DynamoCommit { + /// Create a new [`DynamoCommit`] with a given table name + pub fn new(table_name: String) -> Self { + Self { + table_name, + timeout: 20_000, + max_clock_skew_rate: 3, + ttl: Duration::from_secs(60 * 60), + test_interval: Duration::from_millis(100), + } + } + + /// Overrides the lock timeout. + /// + /// A longer lock timeout reduces the probability of spurious commit failures and multi-writer + /// races, but will increase the time that writers must wait to reclaim a lock lost. The + /// default value of 20 seconds should be appropriate for must use-cases. + pub fn with_timeout(mut self, millis: u64) -> Self { + self.timeout = millis; + self + } + + /// The maximum clock skew rate tolerated by the system. + /// + /// An environment in which the clock on the fastest node ticks twice as fast as the slowest + /// node, would have a clock skew rate of 2. The default value of 3 should be appropriate + /// for most environments. + pub fn with_max_clock_skew_rate(mut self, rate: u32) -> Self { + self.max_clock_skew_rate = rate; + self + } + + /// The length of time a record should be retained in DynamoDB before being cleaned up + /// + /// This should be significantly larger than the configured lock timeout, with the default + /// value of 1 hour appropriate for most use-cases. + pub fn with_ttl(mut self, ttl: Duration) -> Self { + self.ttl = ttl; + self + } + + /// Returns the name of the DynamoDB table. + pub(crate) fn table_name(&self) -> &str { + &self.table_name + } + + pub(crate) async fn copy_if_not_exists( + &self, + client: &S3Client, + from: &Path, + to: &Path, + ) -> Result<()> { + check_not_exists(client, to).await?; + + let mut previous_lease = None; + + loop { + let existing = previous_lease.as_ref(); + match self.try_lock(client, to.as_ref(), existing).await? 
{ + TryLockResult::Ok(lease) => { + let fut = client.copy_request(from, to).send(); + let expiry = lease.acquire + lease.timeout; + return match tokio::time::timeout_at(expiry.into(), fut).await { + Ok(Ok(_)) => Ok(()), + Ok(Err(e)) => Err(e.into()), + Err(_) => Err(Error::Generic { + store: "DynamoDB", + source: format!( + "Failed to perform copy operation in {} milliseconds", + self.timeout + ) + .into(), + }), + }; + } + TryLockResult::Conflict(conflict) => { + let mut interval = tokio::time::interval(self.test_interval); + let expiry = conflict.timeout * self.max_clock_skew_rate; + loop { + interval.tick().await; + check_not_exists(client, to).await?; + if conflict.acquire.elapsed() > expiry { + previous_lease = Some(conflict); + break; + } + } + } + } + } + } + + /// Retrieve a lock, returning an error if it doesn't exist + async fn get_lock(&self, s3: &S3Client, key: &str) -> Result { + let key_attributes = [("key", AttributeValue::String(key))]; + let req = GetItem { + table_name: &self.table_name, + key: Map(&key_attributes), + }; + let credential = s3.config.get_credential().await?; + + let resp = self + .request(s3, credential.as_deref(), "DynamoDB_20120810.GetItem", req) + .await + .map_err(|e| e.error(STORE, key.to_string()))?; + + let body = resp.bytes().await.map_err(|e| Error::Generic { + store: STORE, + source: Box::new(e), + })?; + + let response: GetItemResponse<'_> = + serde_json::from_slice(body.as_ref()).map_err(|e| Error::Generic { + store: STORE, + source: Box::new(e), + })?; + + extract_lease(&response.item).ok_or_else(|| Error::NotFound { + path: key.into(), + source: "DynamoDB GetItem returned no items".to_string().into(), + }) + } + + /// Attempt to acquire a lock, reclaiming an existing lease if provided + async fn try_lock( + &self, + s3: &S3Client, + key: &str, + existing: Option<&Lease>, + ) -> Result { + let attributes; + let (next_gen, condition_expression, expression_attribute_values) = match existing { + None => (0_u64, "attribute_not_exists(#pk)", Map(&[])), + Some(existing) => { + attributes = [(":g", AttributeValue::Number(existing.generation))]; + ( + existing.generation.checked_add(1).unwrap(), + "attribute_exists(#pk) AND generation = :g", + Map(attributes.as_slice()), + ) + } + }; + + let ttl = (Utc::now() + self.ttl).timestamp(); + let items = [ + ("key", AttributeValue::String(key)), + ("generation", AttributeValue::Number(next_gen)), + ("timeout", AttributeValue::Number(self.timeout)), + ("ttl", AttributeValue::Number(ttl as _)), + ]; + let names = [("#pk", "key")]; + + let req = PutItem { + table_name: &self.table_name, + condition_expression, + expression_attribute_values, + expression_attribute_names: Map(&names), + item: Map(&items), + return_values: None, + return_values_on_condition_check_failure: Some(ReturnValues::AllOld), + }; + + let credential = s3.config.get_credential().await?; + + let acquire = Instant::now(); + match self + .request(s3, credential.as_deref(), "DynamoDB_20120810.PutItem", req) + .await + { + Ok(_) => Ok(TryLockResult::Ok(Lease { + acquire, + generation: next_gen, + timeout: Duration::from_millis(self.timeout), + })), + Err(e) => match parse_error_response(&e) { + Some(e) if e.error.ends_with(CONFLICT) => match extract_lease(&e.item) { + Some(lease) => Ok(TryLockResult::Conflict(lease)), + // ReturnValuesOnConditionCheckFailure is a relatively recent addition + // to DynamoDB and is not supported by dynamodb-local, which is used + // by localstack. 
In such cases the conflict error will not contain + // the conflicting item, and we must instead perform a get request + // + // There is a potential race here if the conflicting record is removed + // before we retrieve it. We could retry the transaction in such a scenario, + // but as this only occurs for emulators, we simply abort with a + // not found error + // + // + // + // + None => Ok(TryLockResult::Conflict(self.get_lock(s3, key).await?)), + }, + _ => Err(Error::Generic { + store: STORE, + source: Box::new(e), + }), + }, + } + } + + async fn request( + &self, + s3: &S3Client, + cred: Option<&AwsCredential>, + target: &str, + req: R, + ) -> Result { + let region = &s3.config.region; + + let builder = match &s3.config.endpoint { + Some(e) => s3.client.post(e), + None => { + let url = format!("https://dynamodb.{region}.amazonaws.com"); + s3.client.post(url) + } + }; + + builder + .timeout(Duration::from_millis(self.timeout)) + .json(&req) + .header("X-Amz-Target", target) + .with_aws_sigv4(cred, region, "dynamodb", true, None) + .send_retry(&s3.config.retry_config) + .await + } +} + +#[derive(Debug)] +enum TryLockResult { + /// Successfully acquired a lease + Ok(Lease), + /// An existing lease was found + Conflict(Lease), +} + +/// Returns an [`Error::AlreadyExists`] if `path` exists +async fn check_not_exists(client: &S3Client, path: &Path) -> Result<()> { + let options = GetOptions { + head: true, + ..Default::default() + }; + match client.get_opts(path, options).await { + Ok(_) => Err(Error::AlreadyExists { + path: path.to_string(), + source: "Already Exists".to_string().into(), + }), + Err(Error::NotFound { .. }) => Ok(()), + Err(e) => Err(e), + } +} + +/// Parses the error response if any +fn parse_error_response(e: &RetryError) -> Option> { + match e { + RetryError::Client { + status: StatusCode::BAD_REQUEST, + body: Some(b), + } => serde_json::from_str(b).ok(), + _ => None, + } +} + +/// Extracts a lease from `item`, returning `None` on error +fn extract_lease(item: &HashMap<&str, AttributeValue<'_>>) -> Option { + let generation = match item.get("generation") { + Some(AttributeValue::Number(generation)) => generation, + _ => return None, + }; + + let timeout = match item.get("timeout") { + Some(AttributeValue::Number(timeout)) => *timeout, + _ => return None, + }; + + Some(Lease { + acquire: Instant::now(), + generation: *generation, + timeout: Duration::from_millis(timeout), + }) +} + +/// A lock lease +#[derive(Debug, Clone)] +struct Lease { + acquire: Instant, + generation: u64, + timeout: Duration, +} + +/// A DynamoDB [PutItem] payload +/// +/// [PutItem]: https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_PutItem.html +#[derive(Serialize)] +#[serde(rename_all = "PascalCase")] +struct PutItem<'a> { + /// The table name + table_name: &'a str, + + /// A condition that must be satisfied in order for a conditional PutItem operation to succeed. + condition_expression: &'a str, + + /// One or more substitution tokens for attribute names in an expression + expression_attribute_names: Map<'a, &'a str, &'a str>, + + /// One or more values that can be substituted in an expression + expression_attribute_values: Map<'a, &'a str, AttributeValue<'a>>, + + /// A map of attribute name/value pairs, one for each attribute + item: Map<'a, &'a str, AttributeValue<'a>>, + + /// Use ReturnValues if you want to get the item attributes as they appeared + /// before they were updated with the PutItem request. 
+ #[serde(skip_serializing_if = "Option::is_none")] + return_values: Option, + + /// An optional parameter that returns the item attributes for a PutItem operation + /// that failed a condition check. + #[serde(skip_serializing_if = "Option::is_none")] + return_values_on_condition_check_failure: Option, +} + +/// A DynamoDB [GetItem] payload +/// +/// [GetItem]: https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_GetItem.html +#[derive(Serialize)] +#[serde(rename_all = "PascalCase")] +struct GetItem<'a> { + /// The table name + table_name: &'a str, + /// The primary key + key: Map<'a, &'a str, AttributeValue<'a>>, +} + +#[derive(Deserialize)] +struct GetItemResponse<'a> { + #[serde(borrow, default, rename = "Item")] + item: HashMap<&'a str, AttributeValue<'a>>, +} + +#[derive(Deserialize)] +struct ErrorResponse<'a> { + #[serde(rename = "__type")] + error: &'a str, + + #[serde(borrow, default, rename = "Item")] + item: HashMap<&'a str, AttributeValue<'a>>, +} + +#[derive(Serialize)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +enum ReturnValues { + AllOld, +} + +/// A collection of key value pairs +/// +/// This provides cheap, ordered serialization of maps +struct Map<'a, K, V>(&'a [(K, V)]); + +impl<'a, K: Serialize, V: Serialize> Serialize for Map<'a, K, V> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + if self.0.is_empty() { + return serializer.serialize_none(); + } + let mut map = serializer.serialize_map(Some(self.0.len()))?; + for (k, v) in self.0 { + map.serialize_entry(k, v)? + } + map.end() + } +} + +/// A DynamoDB [AttributeValue] +/// +/// [AttributeValue]: https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_AttributeValue.html +#[derive(Debug, Serialize, Deserialize)] +enum AttributeValue<'a> { + #[serde(rename = "S")] + String(&'a str), + #[serde(rename = "N", with = "number")] + Number(u64), +} + +/// Numbers are serialized as strings +mod number { + use serde::{Deserialize, Deserializer, Serializer}; + + pub fn serialize(v: &u64, s: S) -> Result { + s.serialize_str(&v.to_string()) + } + + pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result { + let v: &str = Deserialize::deserialize(d)?; + v.parse().map_err(serde::de::Error::custom) + } +} + +/// Re-export integration_test to be called by s3_test +#[cfg(test)] +pub(crate) use tests::integration_test; + +#[cfg(test)] +mod tests { + + use super::*; + use crate::aws::AmazonS3; + use crate::ObjectStore; + + #[test] + fn test_attribute_serde() { + let serde = serde_json::to_string(&AttributeValue::Number(23)).unwrap(); + assert_eq!(serde, "{\"N\":\"23\"}"); + let back: AttributeValue<'_> = serde_json::from_str(&serde).unwrap(); + assert!(matches!(back, AttributeValue::Number(23))); + } + + /// An integration test for DynamoDB + /// + /// This is a function called by s3_test to avoid test concurrency issues + pub async fn integration_test(integration: &AmazonS3, d: &DynamoCommit) { + let client = integration.client.as_ref(); + + let src = Path::from("dynamo_path_src"); + integration.put(&src, "asd".into()).await.unwrap(); + + let dst = Path::from("dynamo_path"); + let _ = integration.delete(&dst).await; // Delete if present + + // Create a lock if not already exists + let existing = match d.try_lock(client, dst.as_ref(), None).await.unwrap() { + TryLockResult::Conflict(l) => l, + TryLockResult::Ok(l) => l, + }; + + // Should not be able to acquire a lock again + let r = d.try_lock(client, dst.as_ref(), None).await; + assert!(matches!(r, 
Ok(TryLockResult::Conflict(_)))); + + // But should still be able to reclaim lock and perform copy + d.copy_if_not_exists(client, &src, &dst).await.unwrap(); + + match d.try_lock(client, dst.as_ref(), None).await.unwrap() { + TryLockResult::Conflict(new) => { + // Should have incremented generation to do so + assert_eq!(new.generation, existing.generation + 1); + } + _ => panic!("Should conflict"), + } + } +} diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 0985263..75b43d4 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -36,12 +36,12 @@ use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use reqwest::header::{HeaderName, IF_MATCH, IF_NONE_MATCH}; -use reqwest::Method; +use reqwest::{Method, StatusCode}; use std::{sync::Arc, time::Duration}; use tokio::io::AsyncWrite; use url::Url; -use crate::aws::client::S3Client; +use crate::aws::client::{RequestError, S3Client}; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::CredentialProvider; @@ -58,11 +58,13 @@ mod builder; mod checksum; mod client; mod credential; +mod dynamo; mod precondition; mod resolve; pub use builder::{AmazonS3Builder, AmazonS3ConfigKey}; pub use checksum::Checksum; +pub use dynamo::DynamoCommit; pub use precondition::{S3ConditionalPut, S3CopyIfNotExists}; pub use resolve::resolve_bucket_region; @@ -93,19 +95,19 @@ pub struct AmazonS3 { impl std::fmt::Display for AmazonS3 { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "AmazonS3({})", self.client.config().bucket) + write!(f, "AmazonS3({})", self.client.config.bucket) } } impl AmazonS3 { /// Returns the [`AwsCredentialProvider`] used by [`AmazonS3`] pub fn credentials(&self) -> &AwsCredentialProvider { - &self.client.config().credentials + &self.client.config.credentials } /// Create a full URL to the resource specified by `path` with this instance's configuration. 
fn path_url(&self, path: &Path) -> String { - self.client.config().path_url(path) + self.client.config.path_url(path) } } @@ -145,7 +147,7 @@ impl Signer for AmazonS3 { /// ``` async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result { let credential = self.credentials().get_credential().await?; - let authorizer = AwsAuthorizer::new(&credential, "s3", &self.client.config().region); + let authorizer = AwsAuthorizer::new(&credential, "s3", &self.client.config.region); let path_url = self.path_url(path); let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { @@ -164,15 +166,15 @@ impl ObjectStore for AmazonS3 { async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { let mut request = self.client.put_request(location, bytes); let tags = opts.tags.encoded(); - if !tags.is_empty() && !self.client.config().disable_tagging { + if !tags.is_empty() && !self.client.config.disable_tagging { request = request.header(&TAGS_HEADER, tags); } - match (opts.mode, &self.client.config().conditional_put) { - (PutMode::Overwrite, _) => request.send().await, + match (opts.mode, &self.client.config.conditional_put) { + (PutMode::Overwrite, _) => request.do_put().await, (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { - match request.header(&IF_NONE_MATCH, "*").send().await { + match request.header(&IF_NONE_MATCH, "*").do_put().await { // Technically If-None-Match should return NotModified but some stores, // such as R2, instead return PreconditionFailed // https://developers.cloudflare.com/r2/api/s3/extensions/#conditional-operations-in-putobject @@ -190,7 +192,7 @@ impl ObjectStore for AmazonS3 { store: STORE, source: "ETag required for conditional put".to_string().into(), })?; - request.header(&IF_MATCH, etag.as_str()).send().await + request.header(&IF_MATCH, etag.as_str()).do_put().await } } } @@ -261,11 +263,35 @@ impl ObjectStore for AmazonS3 { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - self.client.copy_request(from, to, true).await + self.client.copy_request(from, to).send().await?; + Ok(()) } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - self.client.copy_request(from, to, false).await + let (k, v, status) = match &self.client.config.copy_if_not_exists { + Some(S3CopyIfNotExists::Header(k, v)) => (k, v, StatusCode::PRECONDITION_FAILED), + Some(S3CopyIfNotExists::HeaderWithStatus(k, v, status)) => (k, v, *status), + Some(S3CopyIfNotExists::Dynamo(lock)) => { + return lock.copy_if_not_exists(&self.client, from, to).await + } + None => { + return Err(Error::NotSupported { + source: "S3 does not support copy-if-not-exists".to_string().into(), + }) + } + }; + + let req = self.client.copy_request(from, to); + match req.header(k, v).send().await { + Err(RequestError::Retry { source, path }) if source.status() == Some(status) => { + Err(Error::AlreadyExists { + source: Box::new(source), + path, + }) + } + Err(e) => Err(e.into()), + Ok(_) => Ok(()), + } } } @@ -335,7 +361,7 @@ mod tests { let config = AmazonS3Builder::from_env(); let integration = config.build().unwrap(); - let config = integration.client.config(); + let config = &integration.client.config; let test_not_exists = config.copy_if_not_exists.is_some(); let test_conditional_put = config.conditional_put.is_some(); @@ -368,6 +394,11 @@ mod tests { let builder = AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); let 
integration = builder.build().unwrap(); put_get_delete_list_opts(&integration).await; + + match &integration.client.config.copy_if_not_exists { + Some(S3CopyIfNotExists::Dynamo(d)) => dynamo::integration_test(&integration, d).await, + _ => eprintln!("Skipping dynamo integration test - dynamo not configured"), + }; } #[tokio::test] diff --git a/src/aws/precondition.rs b/src/aws/precondition.rs index ada5f3b..83d45db 100644 --- a/src/aws/precondition.rs +++ b/src/aws/precondition.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::aws::dynamo::DynamoCommit; use crate::config::Parse; use itertools::Itertools; @@ -45,6 +46,15 @@ pub enum S3CopyIfNotExists { /// /// Encoded as `header-with-status:::` ignoring whitespace HeaderWithStatus(String, String, reqwest::StatusCode), + /// The name of a DynamoDB table to use for coordination + /// + /// Encoded as either `dynamodb:` or `dynamodb::` + /// ignoring whitespace. The default timeout is used if not specified + /// + /// See [`DynamoCommit`] for more information + /// + /// This will use the same region, credentials and endpoint as configured for S3 + Dynamo(DynamoCommit), } impl std::fmt::Display for S3CopyIfNotExists { @@ -54,6 +64,7 @@ impl std::fmt::Display for S3CopyIfNotExists { Self::HeaderWithStatus(k, v, code) => { write!(f, "header-with-status: {k}: {v}: {}", code.as_u16()) } + Self::Dynamo(lock) => write!(f, "dynamo: {}", lock.table_name()), } } } @@ -77,6 +88,12 @@ impl S3CopyIfNotExists { code, )) } + "dynamo" => Some(Self::Dynamo(match value.split_once(':') { + Some((table_name, timeout)) => DynamoCommit::new(table_name.trim().to_string()) + .with_timeout(timeout.parse().ok()?), + None => DynamoCommit::new(value.trim().to_string()), + })), + _ => None, } } From ca756dc9055d38fe02a0cb9e88135b8ee27cf44e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 29 Dec 2023 21:14:18 +0000 Subject: [PATCH 248/397] Further Clippy Lints and Temporarily disable JS integration tests (#5258) * Temporarily disable JS integration tests * ObjectStore Clippy 1.75 --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 3a84166..632e949 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1886,7 +1886,7 @@ mod tests { // We can abort an in-progress write let (upload_id, mut writer) = storage.put_multipart(&location).await.unwrap(); - if let Some(chunk) = data.get(0) { + if let Some(chunk) = data.first() { writer.write_all(chunk).await.unwrap(); let _ = writer.write(chunk).await.unwrap(); } From d0da1ec83a42e99c87c3cef1d77817bfe44a7b86 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 30 Dec 2023 14:29:03 +0000 Subject: [PATCH 249/397] Default AWS region to us-east-1 (#5211) (#5244) --- src/aws/builder.rs | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 5f7f1c9..5e05b05 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -41,9 +41,6 @@ static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; #[derive(Debug, Snafu)] #[allow(missing_docs)] enum Error { - #[snafu(display("Missing region"))] - MissingRegion, - #[snafu(display("Missing bucket name"))] MissingBucketName, @@ -559,19 +556,25 @@ impl AmazonS3Builder { Ok(()) } - /// Set the AWS Access Key (required) + /// Set the AWS Access Key pub fn 
with_access_key_id(mut self, access_key_id: impl Into) -> Self { self.access_key_id = Some(access_key_id.into()); self } - /// Set the AWS Secret Access Key (required) + /// Set the AWS Secret Access Key pub fn with_secret_access_key(mut self, secret_access_key: impl Into) -> Self { self.secret_access_key = Some(secret_access_key.into()); self } - /// Set the region (e.g. `us-east-1`) (required) + /// Set the AWS Session Token to use for requests + pub fn with_token(mut self, token: impl Into) -> Self { + self.token = Some(token.into()); + self + } + + /// Set the region, defaults to `us-east-1` pub fn with_region(mut self, region: impl Into) -> Self { self.region = Some(region.into()); self @@ -583,25 +586,21 @@ impl AmazonS3Builder { self } - /// Sets the endpoint for communicating with AWS S3. Default value - /// is based on region. The `endpoint` field should be consistent with - /// the field `virtual_hosted_style_request'. + /// Sets the endpoint for communicating with AWS S3, defaults to the [region endpoint] /// /// For example, this might be set to `"http://localhost:4566:` /// for testing against a localstack instance. - /// If `virtual_hosted_style_request` is set to true then `endpoint` - /// should have bucket name included. + /// + /// The `endpoint` field should be consistent with [`Self::with_virtual_hosted_style_request`], + /// i.e. if `virtual_hosted_style_request` is set to true then `endpoint` + /// should have the bucket name included. + /// + /// [region endpoint]: https://docs.aws.amazon.com/general/latest/gr/s3.html pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { self.endpoint = Some(endpoint.into()); self } - /// Set the token to use for requests (passed to underlying provider) - pub fn with_token(mut self, token: impl Into) -> Self { - self.token = Some(token.into()); - self - } - /// Set the credential provider overriding any other options pub fn with_credentials(mut self, credentials: AwsCredentialProvider) -> Self { self.credentials = Some(credentials); @@ -741,7 +740,7 @@ impl AmazonS3Builder { } let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; - let region = self.region.context(MissingRegionSnafu)?; + let region = self.region.unwrap_or_else(|| "us-east-1".to_string()); let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; let put_precondition = self.conditional_put.map(|x| x.get()).transpose()?; @@ -950,6 +949,15 @@ mod tests { ); } + #[test] + fn s3_default_region() { + let builder = AmazonS3Builder::new() + .with_bucket_name("foo") + .build() + .unwrap(); + assert_eq!(builder.client.config.region, "us-east-1"); + } + #[test] fn s3_test_urls() { let mut builder = AmazonS3Builder::new(); From bedf624c7226d7dbb84c1cbb739ec520ab21dd31 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 3 Jan 2024 18:59:17 +0800 Subject: [PATCH 250/397] docs(object_store): Mention `with_allow_http` in docs of `with_endpoint` (#5275) Signed-off-by: Xuanwo --- src/aws/builder.rs | 3 +++ src/azure/builder.rs | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 5e05b05..542f17a 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -595,6 +595,9 @@ impl AmazonS3Builder { /// i.e. if `virtual_hosted_style_request` is set to true then `endpoint` /// should have the bucket name included. /// + /// By default, only HTTPS schemes are enabled. To connect to an HTTP endpoint, enable + /// [`Self::with_allow_http`]. 
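[Editor's note] The endpoint/allow-http interaction documented above comes up most often when testing against a local S3 emulator. A minimal sketch of how the builder options combine for that case — the bucket name, credentials and LocalStack port below are placeholder assumptions, not values from this patch:

```rust
use object_store::aws::{AmazonS3, AmazonS3Builder};

fn localstack_store() -> object_store::Result<AmazonS3> {
    AmazonS3Builder::new()
        .with_bucket_name("test-bucket")
        .with_region("us-east-1") // now optional: falls back to us-east-1
        .with_endpoint("http://localhost:4566")
        .with_allow_http(true) // required because the endpoint is plain HTTP
        .with_access_key_id("test")
        .with_secret_access_key("test")
        .build()
}
```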
+ /// /// [region endpoint]: https://docs.aws.amazon.com/general/latest/gr/s3.html pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { self.endpoint = Some(endpoint.into()); diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 2de0a7c..905fa52 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -687,6 +687,9 @@ impl MicrosoftAzureBuilder { /// Override the endpoint used to communicate with blob storage /// /// Defaults to `https://{account}.blob.core.windows.net` + /// + /// By default, only HTTPS schemes are enabled. To connect to an HTTP endpoint, enable + /// [`Self::with_allow_http`]. pub fn with_endpoint(mut self, endpoint: String) -> Self { self.endpoint = Some(endpoint); self From 0c76a79308e37b82049bce4cd3cb5ec7fdc89bdb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 3 Jan 2024 19:34:16 +0000 Subject: [PATCH 251/397] Fix ObjectMeta::size for range requests (#5272) (#5276) * Fix ObjectMeta::size for range requests (#5272) * Docs * Update object_store/src/lib.rs Co-authored-by: Andrew Lamb * Add tests --------- Co-authored-by: Andrew Lamb --- src/client/get.rs | 243 ++++++++++++++++++++++++++++++++++++++++++---- src/lib.rs | 17 +++- 2 files changed, 236 insertions(+), 24 deletions(-) diff --git a/src/client/get.rs b/src/client/get.rs index 5f9cac9..b7e7f24 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -15,13 +15,18 @@ // specific language governing permissions and limitations // under the License. +use std::ops::Range; + use crate::client::header::{header_meta, HeaderConfig}; use crate::path::Path; -use crate::{Error, GetOptions, GetResult}; -use crate::{GetResultPayload, Result}; +use crate::{Error, GetOptions, GetResult, GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; +use hyper::header::CONTENT_RANGE; +use hyper::StatusCode; +use reqwest::header::ToStrError; use reqwest::Response; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A client that can perform a get request #[async_trait] @@ -45,25 +50,221 @@ impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.get_request(location, options).await?; - let meta = header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { - Error::Generic { - store: T::STORE, - source: Box::new(e), - } - })?; - - let stream = response - .bytes_stream() - .map_err(|source| Error::Generic { - store: T::STORE, - source: Box::new(source), - }) - .boxed(); - - Ok(GetResult { - range: range.unwrap_or(0..meta.size), - payload: GetResultPayload::Stream(stream), - meta, + get_result::(location, range, response).map_err(|e| crate::Error::Generic { + store: T::STORE, + source: Box::new(e), + }) + } +} + +struct ContentRange { + /// The range of the object returned + range: Range, + /// The total size of the object being requested + size: usize, +} + +impl ContentRange { + /// Parse a content range of the form `bytes -/` + /// + /// + fn from_str(s: &str) -> Option { + let rem = s.trim().strip_prefix("bytes ")?; + let (range, size) = rem.split_once('/')?; + let size = size.parse().ok()?; + + let (start_s, end_s) = range.split_once('-')?; + + let start = start_s.parse().ok()?; + let end: usize = end_s.parse().ok()?; + + Some(Self { + size, + range: start..end + 1, }) } } + +/// A specialized `Error` for get-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum GetResultError { + 
#[snafu(context(false))] + Header { + source: crate::client::header::Error, + }, + + #[snafu(display("Received non-partial response when range requested"))] + NotPartial, + + #[snafu(display("Content-Range header not present in partial response"))] + NoContentRange, + + #[snafu(display("Failed to parse value for CONTENT_RANGE header: \"{value}\""))] + ParseContentRange { value: String }, + + #[snafu(display("Content-Range header contained non UTF-8 characters"))] + InvalidContentRange { source: ToStrError }, + + #[snafu(display("Requested {expected:?}, got {actual:?}"))] + UnexpectedRange { + expected: Range, + actual: Range, + }, +} + +fn get_result( + location: &Path, + range: Option>, + response: Response, +) -> Result { + let mut meta = header_meta(location, response.headers(), T::HEADER_CONFIG)?; + + // ensure that we receive the range we asked for + let range = if let Some(expected) = range { + ensure!( + response.status() == StatusCode::PARTIAL_CONTENT, + NotPartialSnafu + ); + let val = response + .headers() + .get(CONTENT_RANGE) + .context(NoContentRangeSnafu)?; + + let value = val.to_str().context(InvalidContentRangeSnafu)?; + let value = ContentRange::from_str(value).context(ParseContentRangeSnafu { value })?; + let actual = value.range; + + ensure!( + actual == expected, + UnexpectedRangeSnafu { expected, actual } + ); + + // Update size to reflect full size of object (#5272) + meta.size = value.size; + actual + } else { + 0..meta.size + }; + + let stream = response + .bytes_stream() + .map_err(|source| Error::Generic { + store: T::STORE, + source: Box::new(source), + }) + .boxed(); + + Ok(GetResult { + range, + meta, + payload: GetResultPayload::Stream(stream), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use hyper::http; + use hyper::http::header::*; + + struct TestClient {} + + #[async_trait] + impl GetClient for TestClient { + const STORE: &'static str = "TEST"; + + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: false, + last_modified_required: false, + version_header: None, + }; + + async fn get_request(&self, _: &Path, _: GetOptions) -> Result { + unimplemented!() + } + } + + fn make_response( + object_size: usize, + range: Option>, + status: StatusCode, + content_range: Option<&str>, + ) -> Response { + let mut builder = http::Response::builder(); + if let Some(range) = content_range { + builder = builder.header(CONTENT_RANGE, range); + } + + let body = match range { + Some(range) => vec![0_u8; range.end - range.start], + None => vec![0_u8; object_size], + }; + + builder + .status(status) + .header(CONTENT_LENGTH, object_size) + .body(body) + .unwrap() + .into() + } + + #[tokio::test] + async fn test_get_result() { + let path = Path::from("test"); + + let resp = make_response(12, None, StatusCode::OK, None); + let res = get_result::(&path, None, resp).unwrap(); + assert_eq!(res.meta.size, 12); + assert_eq!(res.range, 0..12); + let bytes = res.bytes().await.unwrap(); + assert_eq!(bytes.len(), 12); + + let resp = make_response( + 12, + Some(2..3), + StatusCode::PARTIAL_CONTENT, + Some("bytes 2-2/12"), + ); + let res = get_result::(&path, Some(2..3), resp).unwrap(); + assert_eq!(res.meta.size, 12); + assert_eq!(res.range, 2..3); + let bytes = res.bytes().await.unwrap(); + assert_eq!(bytes.len(), 1); + + let resp = make_response(12, Some(2..3), StatusCode::OK, None); + let err = get_result::(&path, Some(2..3), resp).unwrap_err(); + assert_eq!( + err.to_string(), + "Received non-partial response when range requested" + ); + + let resp = 
make_response( + 12, + Some(2..3), + StatusCode::PARTIAL_CONTENT, + Some("bytes 2-3/12"), + ); + let err = get_result::(&path, Some(2..3), resp).unwrap_err(); + assert_eq!(err.to_string(), "Requested 2..3, got 2..4"); + + let resp = make_response( + 12, + Some(2..3), + StatusCode::PARTIAL_CONTENT, + Some("bytes 2-2/*"), + ); + let err = get_result::(&path, Some(2..3), resp).unwrap_err(); + assert_eq!( + err.to_string(), + "Failed to parse value for CONTENT_RANGE header: \"bytes 2-2/*\"" + ); + + let resp = make_response(12, Some(2..3), StatusCode::PARTIAL_CONTENT, None); + let err = get_result::(&path, Some(2..3), resp).unwrap_err(); + assert_eq!( + err.to_string(), + "Content-Range header not present in partial response" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 632e949..b438254 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1303,12 +1303,23 @@ mod tests { let range = 3..7; let range_result = storage.get_range(&location, range.clone()).await; + let bytes = range_result.unwrap(); + assert_eq!(bytes, expected_data.slice(range.clone())); + + let opts = GetOptions { + range: Some(2..5), + ..Default::default() + }; + let result = storage.get_opts(&location, opts).await.unwrap(); + // Data is `"arbitrary data"`, length 14 bytes + assert_eq!(result.meta.size, 14); // Should return full object size (#5272) + assert_eq!(result.range, 2..5); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"bit".as_ref()); + let out_of_range = 200..300; let out_of_range_result = storage.get_range(&location, out_of_range).await; - let bytes = range_result.unwrap(); - assert_eq!(bytes, expected_data.slice(range)); - // Should be a non-fatal error out_of_range_result.unwrap_err(); From b226208dc5cefe17d23e66e9e1e28e8fcff0b65b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 4 Jan 2024 08:39:02 +0000 Subject: [PATCH 252/397] Retry Safe/Read-Only Requests on Timeout (#5278) * Retry safe requests on timeout * Docs --- src/client/mock_server.rs | 21 ++++++++++++---- src/client/retry.rs | 50 +++++++++++++++++++++++++++++++++------ 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/src/client/mock_server.rs b/src/client/mock_server.rs index 36c6b65..70b8561 100644 --- a/src/client/mock_server.rs +++ b/src/client/mock_server.rs @@ -15,17 +15,20 @@ // specific language governing permissions and limitations // under the License. 
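[Editor's note] The `Content-Range` parsing exercised by the tests above follows the `bytes <start>-<end>/<size>` form; a standalone sketch of the same logic (illustrative only, not the crate's public API):

```rust
use std::ops::Range;

/// Parse a `Content-Range` value such as `bytes 2-2/12` into the returned
/// byte range plus the total object size; `bytes 2-2/*` (unknown length)
/// yields `None`, matching the error cases tested above.
fn parse_content_range(s: &str) -> Option<(Range<usize>, usize)> {
    let rem = s.trim().strip_prefix("bytes ")?;
    let (range, size) = rem.split_once('/')?;
    let size: usize = size.parse().ok()?;
    let (start, end) = range.split_once('-')?;
    let start: usize = start.parse().ok()?;
    let end: usize = end.parse().ok()?;
    Some((start..end + 1, size)) // HTTP ranges are inclusive of `end`
}

#[test]
fn content_range_sketch() {
    assert_eq!(parse_content_range("bytes 2-2/12"), Some((2..3, 12)));
    assert_eq!(parse_content_range("bytes 2-2/*"), None);
}
```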
+use futures::future::BoxFuture; +use futures::FutureExt; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Request, Response, Server}; use parking_lot::Mutex; use std::collections::VecDeque; use std::convert::Infallible; +use std::future::Future; use std::net::SocketAddr; use std::sync::Arc; use tokio::sync::oneshot; use tokio::task::JoinHandle; -pub type ResponseFn = Box) -> Response + Send>; +pub type ResponseFn = Box) -> BoxFuture<'static, Response> + Send>; /// A mock server pub struct MockServer { @@ -46,9 +49,10 @@ impl MockServer { async move { Ok::<_, Infallible>(service_fn(move |req| { let r = Arc::clone(&r); + let next = r.lock().pop_front(); async move { - Ok::<_, Infallible>(match r.lock().pop_front() { - Some(r) => r(req), + Ok::<_, Infallible>(match next { + Some(r) => r(req).await, None => Response::new(Body::from("Hello World")), }) } @@ -93,7 +97,16 @@ impl MockServer { where F: FnOnce(Request) -> Response + Send + 'static, { - self.responses.lock().push_back(Box::new(f)) + let f = Box::new(|req| async move { f(req) }.boxed()); + self.responses.lock().push_back(f) + } + + pub fn push_async_fn(&self, f: F) + where + F: FnOnce(Request) -> Fut + Send + 'static, + Fut: Future> + Send + 'static, + { + self.responses.lock().push_back(Box::new(|r| f(r).boxed())) } /// Shutdown the mock server diff --git a/src/client/retry.rs b/src/client/retry.rs index 08b9a74..9d21867 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -119,11 +119,19 @@ impl From for std::io::Error { pub type Result = std::result::Result; -/// Contains the configuration for how to respond to server errors +/// The configuration for how to respond to request errors /// -/// By default they will be retried up to some limit, using exponential +/// The following categories of error will be retried: +/// +/// * 5xx server errors +/// * Connection errors +/// * Dropped connections +/// * Timeouts for [safe] / read-only requests +/// +/// Requests will be retried up to some limit, using exponential /// backoff with jitter. 
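[Editor's note] From the caller's side, this change makes an aggressive client timeout safer for read-only requests, since timed-out GET/HEAD requests are retried against the configured budget. A sketch of how the pieces fit together — the bucket name and the specific durations are illustrative assumptions, not values mandated by the patch:

```rust
use std::time::Duration;

use object_store::aws::{AmazonS3, AmazonS3Builder};
use object_store::{BackoffConfig, ClientOptions, RetryConfig};

fn build_with_retries() -> object_store::Result<AmazonS3> {
    let retry = RetryConfig {
        backoff: BackoffConfig::default(),
        max_retries: 3,
        retry_timeout: Duration::from_secs(60),
    };
    AmazonS3Builder::from_env()
        .with_bucket_name("my-bucket")
        .with_retry(retry)
        // Safe (read-only) requests that hit this timeout are now retried
        // rather than failing outright
        .with_client_options(ClientOptions::new().with_timeout(Duration::from_secs(30)))
        .build()
}
```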
See [`BackoffConfig`] for more information /// +/// [safe]: https://datatracker.ietf.org/doc/html/rfc7231#section-4.2.1 #[derive(Debug, Clone)] pub struct RetryConfig { /// The backoff configuration @@ -173,13 +181,16 @@ impl RetryExt for reqwest::RequestBuilder { let max_retries = config.max_retries; let retry_timeout = config.retry_timeout; + let (client, req) = self.build_split(); + let req = req.expect("request must be valid"); + async move { let mut retries = 0; let now = Instant::now(); loop { - let s = self.try_clone().expect("request body must be cloneable"); - match s.send().await { + let s = req.try_clone().expect("request body must be cloneable"); + match client.execute(s).await { Ok(r) => match r.error_for_status_ref() { Ok(_) if r.status().is_success() => return Ok(r), Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { @@ -242,7 +253,9 @@ impl RetryExt for reqwest::RequestBuilder { Err(e) => { let mut do_retry = false; - if let Some(source) = e.source() { + if req.method().is_safe() && e.is_timeout() { + do_retry = true + } else if let Some(source) = e.source() { if let Some(e) = source.downcast_ref::() { if e.is_connect() || e.is_closed() || e.is_incomplete_message() { do_retry = true; @@ -294,7 +307,11 @@ mod tests { retry_timeout: Duration::from_secs(1000), }; - let client = Client::new(); + let client = Client::builder() + .timeout(Duration::from_millis(100)) + .build() + .unwrap(); + let do_request = || client.request(Method::GET, mock.url()).send_retry(&retry); // Simple request should work @@ -419,7 +436,7 @@ mod tests { let e = do_request().await.unwrap_err().to_string(); assert!( - e.contains("Error after 2 retries in") && + e.contains("Error after 2 retries in") && e.contains("max_retries:2, retry_timeout:1000s, source:HTTP status server error (502 Bad Gateway) for url"), "{e}" ); @@ -442,6 +459,25 @@ mod tests { "{e}" ); + // Retries on client timeout + mock.push_async_fn(|_| async move { + tokio::time::sleep(Duration::from_secs(10)).await; + panic!() + }); + do_request().await.unwrap(); + + // Does not retry PUT request + mock.push_async_fn(|_| async move { + tokio::time::sleep(Duration::from_secs(10)).await; + panic!() + }); + let res = client.request(Method::PUT, mock.url()).send_retry(&retry); + let e = res.await.unwrap_err().to_string(); + assert!( + e.contains("Error after 0 retries in") && e.contains("operation timed out"), + "{e}" + ); + // Shutdown mock.shutdown().await } From ca790eba7c0f403dbdc9d77ed8d111d312bcdac7 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Thu, 4 Jan 2024 17:40:51 +0100 Subject: [PATCH 253/397] feat(object_store): Azure url signing (#5259) * refactor: move current signing to new AzureAuthrizer * feat: generate signed urls with master key * feat: sign with user delegated keys * chore: clippy * pr feedback * chore: clippy * pr feedback II * fix: move sigining test --- src/aws/mod.rs | 1 + src/azure/client.rs | 117 +++++++++++++++- src/azure/credential.rs | 295 +++++++++++++++++++++++++++++++++++----- src/azure/mod.rs | 103 +++++++++++++- src/lib.rs | 23 ++++ src/signer.rs | 16 +++ 6 files changed, 517 insertions(+), 38 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 75b43d4..20e7b03 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -372,6 +372,7 @@ mod tests { rename_and_copy(&integration).await; stream_get(&integration).await; multipart(&integration, &integration).await; + signing(&integration).await; tagging(&integration, !config.disable_tagging, |p| { let client = 
Arc::clone(&integration.client); diff --git a/src/azure/client.rs b/src/azure/client.rs index 3c71e69..865e0a1 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -46,6 +46,7 @@ use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::HashMap; use std::sync::Arc; +use std::time::Duration; use url::Url; const VERSION_HEADER: &str = "x-ms-version-id"; @@ -101,6 +102,18 @@ pub(crate) enum Error { #[snafu(display("ETag required for conditional update"))] MissingETag, + + #[snafu(display("Error requesting user delegation key: {}", source))] + DelegationKeyRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting user delegation key response body: {}", source))] + DelegationKeyResponseBody { source: reqwest::Error }, + + #[snafu(display("Got invalid user delegation key response: {}", source))] + DelegationKeyResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] + SASforSASNotSupported, } impl From for crate::Error { @@ -324,6 +337,78 @@ impl AzureClient { Ok(()) } + /// Make a Get User Delegation Key request + /// + async fn get_user_delegation_key( + &self, + start: &DateTime, + end: &DateTime, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.config.service.clone(); + + let start = start.to_rfc3339_opts(chrono::SecondsFormat::Secs, true); + let expiry = end.to_rfc3339_opts(chrono::SecondsFormat::Secs, true); + + let mut body = String::new(); + body.push_str("\n\n"); + body.push_str(&format!( + "\t{start}\n\t{expiry}\n" + )); + body.push_str(""); + + let response = self + .client + .request(Method::POST, url) + .body(body) + .query(&[("restype", "service"), ("comp", "userdelegationkey")]) + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(DelegationKeyRequestSnafu)? + .bytes() + .await + .context(DelegationKeyResponseBodySnafu)?; + + let response: UserDelegationKey = + quick_xml::de::from_reader(response.reader()).context(DelegationKeyResponseSnafu)?; + + Ok(response) + } + + /// Creat an AzureSigner for generating SAS tokens (pre-signed urls). + /// + /// Depending on the type of credential, this will either use the account key or a user delegation key. + /// Since delegation keys are acquired ad-hoc, the signer aloows for signing multiple urls with the same key. 
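[Editor's note] Because the (possibly delegated) signing key is fetched once per `signer` call, the `Signer::signed_urls` helper added later in this patch lets callers amortize that single key request across many paths. A rough usage sketch, assuming placeholder account, key and container names:

```rust
use std::time::Duration;

use object_store::azure::MicrosoftAzureBuilder;
use object_store::path::Path;
use object_store::signer::Signer;
use reqwest::Method;

async fn presign_batch() -> Result<Vec<url::Url>, Box<dyn std::error::Error>> {
    let azure = MicrosoftAzureBuilder::new()
        .with_account("my-account")
        .with_access_key("my-access-key")
        .with_container_name("my-container")
        .build()?;
    let paths = [Path::from("data/a.parquet"), Path::from("data/b.parquet")];
    // One signing key (account key or user delegation key) signs every URL
    let urls = azure
        .signed_urls(Method::GET, &paths, Duration::from_secs(3600))
        .await?;
    Ok(urls)
}
```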
+ pub async fn signer(&self, expires_in: Duration) -> Result { + let credential = self.get_credential().await?; + let signed_start = chrono::Utc::now(); + let signed_expiry = signed_start + expires_in; + match credential.as_ref() { + AzureCredential::BearerToken(_) => { + let key = self + .get_user_delegation_key(&signed_start, &signed_expiry) + .await?; + let signing_key = AzureAccessKey::try_new(&key.value)?; + Ok(AzureSigner::new( + signing_key, + self.config.account.clone(), + signed_start, + signed_expiry, + Some(key), + )) + } + AzureCredential::AccessKey(key) => Ok(AzureSigner::new( + key.to_owned(), + self.config.account.clone(), + signed_start, + signed_expiry, + None, + )), + _ => Err(Error::SASforSASNotSupported.into()), + } + } + #[cfg(test)] pub async fn get_blob_tagging(&self, path: &Path) -> Result { let credential = self.get_credential().await?; @@ -600,6 +685,18 @@ impl BlockList { } } +#[derive(Debug, Clone, PartialEq, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub(crate) struct UserDelegationKey { + pub signed_oid: String, + pub signed_tid: String, + pub signed_start: String, + pub signed_expiry: String, + pub signed_service: String, + pub signed_version: String, + pub value: String, +} + #[cfg(test)] mod tests { use bytes::Bytes; @@ -757,8 +854,7 @@ mod tests { "; - let mut _list_blobs_response_internal: ListResultInternal = - quick_xml::de::from_str(S).unwrap(); + let _list_blobs_response_internal: ListResultInternal = quick_xml::de::from_str(S).unwrap(); } #[test] @@ -778,4 +874,21 @@ mod tests { assert_eq!(res, S) } + + #[test] + fn test_delegated_key_response() { + const S: &str = r#" + + String containing a GUID value + String containing a GUID value + String formatted as ISO date + String formatted as ISO date + b + String specifying REST api version to use to create the user delegation key + String containing the user delegation key +"#; + + let _delegated_key_response_internal: UserDelegationKey = + quick_xml::de::from_str(S).unwrap(); + } } diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 2b8788d..bfbbde8 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -24,26 +24,27 @@ use crate::RetryConfig; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; -use chrono::{DateTime, Utc}; -use reqwest::header::ACCEPT; -use reqwest::{ - header::{ - HeaderMap, HeaderName, HeaderValue, AUTHORIZATION, CONTENT_ENCODING, CONTENT_LANGUAGE, - CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, IF_MODIFIED_SINCE, IF_NONE_MATCH, - IF_UNMODIFIED_SINCE, RANGE, - }, - Client, Method, RequestBuilder, +use chrono::{DateTime, SecondsFormat, Utc}; +use reqwest::header::{ + HeaderMap, HeaderName, HeaderValue, ACCEPT, AUTHORIZATION, CONTENT_ENCODING, CONTENT_LANGUAGE, + CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, IF_MODIFIED_SINCE, IF_NONE_MATCH, + IF_UNMODIFIED_SINCE, RANGE, }; +use reqwest::{Client, Method, Request, RequestBuilder}; use serde::Deserialize; use snafu::{ResultExt, Snafu}; use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt::Debug; use std::process::Command; use std::str; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; use url::Url; -static AZURE_VERSION: HeaderValue = HeaderValue::from_static("2021-08-06"); +use super::client::UserDelegationKey; + +static AZURE_VERSION: HeaderValue = HeaderValue::from_static("2023-11-03"); static VERSION: HeaderName = HeaderName::from_static("x-ms-version"); pub(crate) static BLOB_TYPE: HeaderName = 
HeaderName::from_static("x-ms-blob-type"); pub(crate) static DELETE_SNAPSHOTS: HeaderName = HeaderName::from_static("x-ms-delete-snapshots"); @@ -83,6 +84,9 @@ pub enum Error { #[snafu(display("Failed to parse azure cli response: {source}"))] AzureCliResponse { source: serde_json::Error }, + + #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] + SASforSASNotSupported, } pub type Result = std::result::Result; @@ -97,7 +101,7 @@ impl From for crate::Error { } /// A shared Azure Storage Account Key -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Clone, Eq, PartialEq)] pub struct AzureAccessKey(Vec); impl AzureAccessKey { @@ -137,33 +141,86 @@ pub mod authority_hosts { pub const AZURE_PUBLIC_CLOUD: &str = "https://login.microsoftonline.com"; } -pub(crate) trait CredentialExt { - /// Apply authorization to requests against azure storage accounts - /// - fn with_azure_authorization(self, credential: &AzureCredential, account: &str) -> Self; +pub(crate) struct AzureSigner { + signing_key: AzureAccessKey, + start: DateTime, + end: DateTime, + account: String, + delegation_key: Option, } -impl CredentialExt for RequestBuilder { - fn with_azure_authorization(mut self, credential: &AzureCredential, account: &str) -> Self { +impl AzureSigner { + pub fn new( + signing_key: AzureAccessKey, + account: String, + start: DateTime, + end: DateTime, + delegation_key: Option, + ) -> Self { + Self { + signing_key, + account, + start, + end, + delegation_key, + } + } + + pub fn sign(&self, method: &Method, url: &mut Url) -> Result<()> { + let (str_to_sign, query_pairs) = match &self.delegation_key { + Some(delegation_key) => string_to_sign_user_delegation_sas( + url, + method, + &self.account, + &self.start, + &self.end, + delegation_key, + ), + None => string_to_sign_service_sas(url, method, &self.account, &self.start, &self.end), + }; + let auth = hmac_sha256(&self.signing_key.0, str_to_sign); + url.query_pairs_mut().extend_pairs(query_pairs); + url.query_pairs_mut() + .append_pair("sig", BASE64_STANDARD.encode(auth).as_str()); + Ok(()) + } +} + +/// Authorize a [`Request`] with an [`AzureAuthorizer`] +#[derive(Debug)] +pub struct AzureAuthorizer<'a> { + credential: &'a AzureCredential, + account: &'a str, +} + +impl<'a> AzureAuthorizer<'a> { + /// Create a new [`AzureAuthorizer`] + pub fn new(credential: &'a AzureCredential, account: &'a str) -> Self { + AzureAuthorizer { + credential, + account, + } + } + + /// Authorize `request` + pub fn authorize(&self, request: &mut Request) { // rfc2822 string should never contain illegal characters let date = Utc::now(); let date_str = date.format(RFC1123_FMT).to_string(); // we formatted the data string ourselves, so unwrapping should be fine let date_val = HeaderValue::from_str(&date_str).unwrap(); - self = self - .header(DATE, &date_val) - .header(&VERSION, &AZURE_VERSION); + request.headers_mut().insert(DATE, date_val); + request + .headers_mut() + .insert(&VERSION, AZURE_VERSION.clone()); - match credential { + match self.credential { AzureCredential::AccessKey(key) => { - let (client, request) = self.build_split(); - let mut request = request.expect("request valid"); - let signature = generate_authorization( request.headers(), request.url(), request.method(), - account, + self.account, key, ); @@ -173,15 +230,40 @@ impl CredentialExt for RequestBuilder { AUTHORIZATION, HeaderValue::from_str(signature.as_str()).unwrap(), ); - - Self::from_parts(client, request) } - AzureCredential::BearerToken(token) => self.bearer_auth(token), - 
AzureCredential::SASToken(query_pairs) => self.query(&query_pairs), + AzureCredential::BearerToken(token) => { + request.headers_mut().append( + AUTHORIZATION, + HeaderValue::from_str(format!("Bearer {}", token).as_str()).unwrap(), + ); + } + AzureCredential::SASToken(query_pairs) => { + request + .url_mut() + .query_pairs_mut() + .extend_pairs(query_pairs); + } } } } +pub(crate) trait CredentialExt { + /// Apply authorization to requests against azure storage accounts + /// + fn with_azure_authorization(self, credential: &AzureCredential, account: &str) -> Self; +} + +impl CredentialExt for RequestBuilder { + fn with_azure_authorization(self, credential: &AzureCredential, account: &str) -> Self { + let (client, request) = self.build_split(); + let mut request = request.expect("request valid"); + + AzureAuthorizer::new(credential, account).authorize(&mut request); + + Self::from_parts(client, request) + } +} + /// Generate signed key for authorization via access keys /// fn generate_authorization( @@ -205,6 +287,152 @@ fn add_if_exists<'a>(h: &'a HeaderMap, key: &HeaderName) -> &'a str { .unwrap_or_default() } +fn string_to_sign_sas( + u: &Url, + method: &Method, + account: &str, + start: &DateTime, + end: &DateTime, +) -> (String, String, String, String, String) { + // NOTE: for now only blob signing is supported. + let signed_resource = "b".to_string(); + + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#permissions-for-a-directory-container-or-blob + let signed_permissions = match *method { + // read and list permissions + Method::GET => match signed_resource.as_str() { + "c" => "rl", + "b" => "r", + _ => unreachable!(), + }, + // write permissions (also allows crating a new blob in a sub-key) + Method::PUT => "w", + // delete permissions + Method::DELETE => "d", + // other methods are not used in any of the current operations + _ => "", + } + .to_string(); + let signed_start = start.to_rfc3339_opts(SecondsFormat::Secs, true); + let signed_expiry = end.to_rfc3339_opts(SecondsFormat::Secs, true); + let canonicalized_resource = if u.host_str().unwrap_or_default().contains(account) { + format!("/blob/{}{}", account, u.path()) + } else { + // NOTE: in case of the emulator, the account name is not part of the host + // but the path starts with the account name + format!("/blob{}", u.path()) + }; + + ( + signed_resource, + signed_permissions, + signed_start, + signed_expiry, + canonicalized_resource, + ) +} + +/// Create a string to be signed for authorization via [service sas]. 
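[Editor's note] For context, the `sig` query parameter appended by `AzureSigner::sign` above is the base64-encoded HMAC-SHA256 of the string-to-sign, keyed by the account key or user delegation key. An equivalent standalone computation, assuming the `ring` and `base64` crates, would be roughly:

```rust
use base64::prelude::BASE64_STANDARD;
use base64::Engine;
use ring::hmac;

/// Sketch of the final signing step: `sig = base64(HMAC-SHA256(key, string_to_sign))`
fn sas_signature(signing_key: &[u8], string_to_sign: &str) -> String {
    let key = hmac::Key::new(hmac::HMAC_SHA256, signing_key);
    let tag = hmac::sign(&key, string_to_sign.as_bytes());
    BASE64_STANDARD.encode(tag.as_ref())
}
```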
+/// +/// [service sas]: https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#version-2020-12-06-and-later +fn string_to_sign_service_sas( + u: &Url, + method: &Method, + account: &str, + start: &DateTime, + end: &DateTime, +) -> (String, HashMap<&'static str, String>) { + let (signed_resource, signed_permissions, signed_start, signed_expiry, canonicalized_resource) = + string_to_sign_sas(u, method, account, start, end); + + let string_to_sign = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}", + signed_permissions, + signed_start, + signed_expiry, + canonicalized_resource, + "", // signed identifier + "", // signed ip + "", // signed protocol + &AZURE_VERSION.to_str().unwrap(), // signed version + signed_resource, // signed resource + "", // signed snapshot time + "", // signed encryption scope + "", // rscc - response header: Cache-Control + "", // rscd - response header: Content-Disposition + "", // rsce - response header: Content-Encoding + "", // rscl - response header: Content-Language + "", // rsct - response header: Content-Type + ); + + let mut pairs = HashMap::new(); + pairs.insert("sv", AZURE_VERSION.to_str().unwrap().to_string()); + pairs.insert("sp", signed_permissions); + pairs.insert("st", signed_start); + pairs.insert("se", signed_expiry); + pairs.insert("sr", signed_resource); + + (string_to_sign, pairs) +} + +/// Create a string to be signed for authorization via [user delegation sas]. +/// +/// [user delegation sas]: https://learn.microsoft.com/en-us/rest/api/storageservices/create-user-delegation-sas#version-2020-12-06-and-later +fn string_to_sign_user_delegation_sas( + u: &Url, + method: &Method, + account: &str, + start: &DateTime, + end: &DateTime, + delegation_key: &UserDelegationKey, +) -> (String, HashMap<&'static str, String>) { + let (signed_resource, signed_permissions, signed_start, signed_expiry, canonicalized_resource) = + string_to_sign_sas(u, method, account, start, end); + + let string_to_sign = format!( + "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}", + signed_permissions, + signed_start, + signed_expiry, + canonicalized_resource, + delegation_key.signed_oid, // signed key object id + delegation_key.signed_tid, // signed key tenant id + delegation_key.signed_start, // signed key start + delegation_key.signed_expiry, // signed key expiry + delegation_key.signed_service, // signed key service + delegation_key.signed_version, // signed key version + "", // signed authorized user object id + "", // signed unauthorized user object id + "", // signed correlation id + "", // signed ip + "", // signed protocol + &AZURE_VERSION.to_str().unwrap(), // signed version + signed_resource, // signed resource + "", // signed snapshot time + "", // signed encryption scope + "", // rscc - response header: Cache-Control + "", // rscd - response header: Content-Disposition + "", // rsce - response header: Content-Encoding + "", // rscl - response header: Content-Language + "", // rsct - response header: Content-Type + ); + + let mut pairs = HashMap::new(); + pairs.insert("sv", AZURE_VERSION.to_str().unwrap().to_string()); + pairs.insert("sp", signed_permissions); + pairs.insert("st", signed_start); + pairs.insert("se", signed_expiry); + pairs.insert("sr", signed_resource); + pairs.insert("skoid", delegation_key.signed_oid.clone()); + pairs.insert("sktid", delegation_key.signed_tid.clone()); + pairs.insert("skt", delegation_key.signed_start.clone()); + pairs.insert("ske", 
delegation_key.signed_expiry.clone()); + pairs.insert("sks", delegation_key.signed_service.clone()); + pairs.insert("skv", delegation_key.signed_version.clone()); + + (string_to_sign, pairs) +} + /// fn string_to_sign(h: &HeaderMap, u: &Url, method: &Method, account: &str) -> String { // content length must only be specified if != 0 @@ -232,7 +460,7 @@ fn string_to_sign(h: &HeaderMap, u: &Url, method: &Method, account: &str) -> Str add_if_exists(h, &IF_UNMODIFIED_SINCE), add_if_exists(h, &RANGE), canonicalize_header(h), - canonicalized_resource(account, u) + canonicalize_resource(account, u) ) } @@ -257,7 +485,7 @@ fn canonicalize_header(headers: &HeaderMap) -> String { } /// -fn canonicalized_resource(account: &str, uri: &Url) -> String { +fn canonicalize_resource(account: &str, uri: &Url) -> String { let mut can_res: String = String::new(); can_res.push('/'); can_res.push_str(account); @@ -681,14 +909,15 @@ impl CredentialProvider for AzureCliCredential { #[cfg(test)] mod tests { - use super::*; - use crate::client::mock_server::MockServer; use futures::executor::block_on; use hyper::body::to_bytes; use hyper::{Body, Response}; use reqwest::{Client, Method}; use tempfile::NamedTempFile; + use super::*; + use crate::client::mock_server::MockServer; + #[tokio::test] async fn test_managed_identity() { let server = MockServer::new(); diff --git a/src/azure/mod.rs b/src/azure/mod.rs index af0a4ce..712b7a3 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -27,22 +27,26 @@ //! a way to drop old blocks. Instead unused blocks are automatically cleaned up //! after 7 days. use crate::{ - multipart::{PartId, PutPart, WriteMultiPart}, + multipart::{MultiPartStore, PartId, PutPart, WriteMultiPart}, path::Path, + signer::Signer, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; +use reqwest::Method; use std::fmt::Debug; use std::sync::Arc; +use std::time::Duration; use tokio::io::AsyncWrite; +use url::Url; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::CredentialProvider; -pub use credential::authority_hosts; +pub use credential::{authority_hosts, AzureAccessKey, AzureAuthorizer}; mod builder; mod client; @@ -50,7 +54,6 @@ mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] pub type AzureCredentialProvider = Arc>; -use crate::multipart::MultiPartStore; pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; @@ -67,6 +70,11 @@ impl MicrosoftAzure { pub fn credentials(&self) -> &AzureCredentialProvider { &self.client.config().credentials } + + /// Create a full URL to the resource specified by `path` with this instance's configuration. + fn path_url(&self, path: &Path) -> url::Url { + self.client.config().path_url(path) + } } impl std::fmt::Display for MicrosoftAzure { @@ -128,6 +136,62 @@ impl ObjectStore for MicrosoftAzure { } } +#[async_trait] +impl Signer for MicrosoftAzure { + /// Create a URL containing the relevant [Service SAS] query parameters that authorize a request + /// via `method` to the resource at `path` valid for the duration specified in `expires_in`. + /// + /// [Service SAS]: https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas + /// + /// # Example + /// + /// This example returns a URL that will enable a user to upload a file to + /// "some-folder/some-file.txt" in the next hour. 
+ /// + /// ``` + /// # async fn example() -> Result<(), Box> { + /// # use object_store::{azure::MicrosoftAzureBuilder, path::Path, signer::Signer}; + /// # use reqwest::Method; + /// # use std::time::Duration; + /// # + /// let azure = MicrosoftAzureBuilder::new() + /// .with_account("my-account") + /// .with_access_key("my-access-key") + /// .with_container_name("my-container") + /// .build()?; + /// + /// let url = azure.signed_url( + /// Method::PUT, + /// &Path::from("some-folder/some-file.txt"), + /// Duration::from_secs(60 * 60) + /// ).await?; + /// # Ok(()) + /// # } + /// ``` + async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result { + let mut url = self.path_url(path); + let signer = self.client.signer(expires_in).await?; + signer.sign(&method, &mut url)?; + Ok(url) + } + + async fn signed_urls( + &self, + method: Method, + paths: &[Path], + expires_in: Duration, + ) -> Result> { + let mut urls = Vec::with_capacity(paths.len()); + let signer = self.client.signer(expires_in).await?; + for path in paths { + let mut url = self.path_url(path); + signer.sign(&method, &mut url)?; + urls.push(url); + } + Ok(urls) + } +} + /// Relevant docs: /// In Azure Blob Store, parts are "blocks" /// put_multipart_part -> PUT block @@ -202,6 +266,7 @@ mod tests { stream_get(&integration).await; put_opts(&integration, true).await; multipart(&integration, &integration).await; + signing(&integration).await; let validate = !integration.client.config().disable_tagging; tagging(&integration, validate, |p| { @@ -211,6 +276,38 @@ mod tests { .await } + #[ignore = "Used for manual testing against a real storage account."] + #[tokio::test] + async fn test_user_delegation_key() { + let account = std::env::var("AZURE_ACCOUNT_NAME").unwrap(); + let container = std::env::var("AZURE_CONTAINER_NAME").unwrap(); + let client_id = std::env::var("AZURE_CLIENT_ID").unwrap(); + let client_secret = std::env::var("AZURE_CLIENT_SECRET").unwrap(); + let tenant_id = std::env::var("AZURE_TENANT_ID").unwrap(); + let integration = MicrosoftAzureBuilder::new() + .with_account(account) + .with_container_name(container) + .with_client_id(client_id) + .with_client_secret(client_secret) + .with_tenant_id(&tenant_id) + .build() + .unwrap(); + + let data = Bytes::from("hello world"); + let path = Path::from("file.txt"); + integration.put(&path, data.clone()).await.unwrap(); + + let signed = integration + .signed_url(Method::GET, &path, Duration::from_secs(60)) + .await + .unwrap(); + + let resp = reqwest::get(signed).await.unwrap(); + let loaded = resp.bytes().await.unwrap(); + + assert_eq!(data, loaded); + } + #[test] fn azure_test_config_get_value() { let azure_client_id = "object_store:fake_access_key_id".to_string(); diff --git a/src/lib.rs b/src/lib.rs index b438254..ab462cc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2126,6 +2126,29 @@ mod tests { assert_eq!(meta.size, chunk_size * 2); } + #[cfg(any(feature = "azure", feature = "aws"))] + pub(crate) async fn signing(integration: &T) + where + T: ObjectStore + crate::signer::Signer, + { + use reqwest::Method; + use std::time::Duration; + + let data = Bytes::from("hello world"); + let path = Path::from("file.txt"); + integration.put(&path, data.clone()).await.unwrap(); + + let signed = integration + .signed_url(Method::GET, &path, Duration::from_secs(60)) + .await + .unwrap(); + + let resp = reqwest::get(signed).await.unwrap(); + let loaded = resp.bytes().await.unwrap(); + + assert_eq!(data, loaded); + } + #[cfg(any(feature = "aws", feature = 
"azure"))] pub(crate) async fn tagging(storage: &dyn ObjectStore, validate: bool, get_tags: F) where diff --git a/src/signer.rs b/src/signer.rs index ed92e28..da55c68 100644 --- a/src/signer.rs +++ b/src/signer.rs @@ -31,4 +31,20 @@ pub trait Signer: Send + Sync + fmt::Debug + 'static { /// implementation's credentials such that the URL can be handed to something that doesn't have /// access to the object store's credentials, to allow limited access to the object store. async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result; + + /// Generate signed urls for multiple paths. + /// + /// See [`Signer::signed_url`] for more details. + async fn signed_urls( + &self, + method: Method, + paths: &[Path], + expires_in: Duration, + ) -> Result> { + let mut urls = Vec::with_capacity(paths.len()); + for path in paths { + urls.push(self.signed_url(method.clone(), path, expires_in).await?); + } + Ok(urls) + } } From 88c6cf97c7741611511d8c5d9d38c849af084bd7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 4 Jan 2024 21:32:26 +0000 Subject: [PATCH 254/397] DynamoDB ConditionalPut (#5247) * Parse Dynamo CondititionalPut * Add etag sort key * Conditional Put * Speedup repeated test runs * Clippy --- src/aws/dynamo.rs | 155 +++++++++++++++++++++++++++++++--------- src/aws/mod.rs | 18 ++++- src/aws/precondition.rs | 46 +++++++++--- src/lib.rs | 12 +++- 4 files changed, 185 insertions(+), 46 deletions(-) diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index ce1500b..f12a421 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -17,7 +17,9 @@ //! A DynamoDB based lock system +use std::borrow::Cow; use std::collections::HashMap; +use std::future::Future; use std::time::{Duration, Instant}; use chrono::Utc; @@ -61,16 +63,24 @@ const STORE: &str = "DynamoDB"; /// /// The DynamoDB schema is as follows: /// -/// * A string hash key named `"key"` +/// * A string partition key named `"path"` +/// * A string sort key named `"etag"` /// * A numeric [TTL] attribute named `"ttl"` /// * A numeric attribute named `"generation"` /// * A numeric attribute named `"timeout"` /// -/// To perform a conditional operation on an object with a given `path` and `etag` (if exists), +/// An appropriate DynamoDB table can be created with the CLI as follows: +/// +/// ```bash +/// $ aws dynamodb create-table --table-name --key-schema AttributeName=path,KeyType=HASH AttributeName=etag,KeyType=RANGE --attribute-definitions AttributeName=path,AttributeType=S AttributeName=etag,AttributeType=S +/// $ aws dynamodb update-time-to-live --table-name --time-to-live-specification Enabled=true,AttributeName=ttl +/// ``` +/// +/// To perform a conditional operation on an object with a given `path` and `etag` (`*` if creating), /// the commit protocol is as follows: /// /// 1. Perform HEAD request on `path` and error on precondition mismatch -/// 2. Create record in DynamoDB with key `{path}#{etag}` with the configured timeout +/// 2. Create record in DynamoDB with given `path` and `etag` with the configured timeout /// 1. On Success: Perform operation with the configured timeout /// 2. On Conflict: /// 1. 
Periodically re-perform HEAD request on `path` and error on precondition mismatch @@ -154,6 +164,16 @@ impl DynamoCommit { self } + /// Parse [`DynamoCommit`] from a string + pub(crate) fn from_str(value: &str) -> Option { + Some(match value.split_once(':') { + Some((table_name, timeout)) => { + Self::new(table_name.trim().to_string()).with_timeout(timeout.parse().ok()?) + } + None => Self::new(value.trim().to_string()), + }) + } + /// Returns the name of the DynamoDB table. pub(crate) fn table_name(&self) -> &str { &self.table_name @@ -165,23 +185,41 @@ impl DynamoCommit { from: &Path, to: &Path, ) -> Result<()> { - check_not_exists(client, to).await?; + self.conditional_op(client, to, None, || async { + client.copy_request(from, to).send().await?; + Ok(()) + }) + .await + } + + #[allow(clippy::future_not_send)] // Generics confound this lint + pub(crate) async fn conditional_op( + &self, + client: &S3Client, + to: &Path, + etag: Option<&str>, + op: F, + ) -> Result + where + F: FnOnce() -> Fut, + Fut: Future>, + { + check_precondition(client, to, etag).await?; let mut previous_lease = None; loop { let existing = previous_lease.as_ref(); - match self.try_lock(client, to.as_ref(), existing).await? { + match self.try_lock(client, to.as_ref(), etag, existing).await? { TryLockResult::Ok(lease) => { - let fut = client.copy_request(from, to).send(); let expiry = lease.acquire + lease.timeout; - return match tokio::time::timeout_at(expiry.into(), fut).await { - Ok(Ok(_)) => Ok(()), - Ok(Err(e)) => Err(e.into()), + return match tokio::time::timeout_at(expiry.into(), op()).await { + Ok(Ok(v)) => Ok(v), + Ok(Err(e)) => Err(e), Err(_) => Err(Error::Generic { store: "DynamoDB", source: format!( - "Failed to perform copy operation in {} milliseconds", + "Failed to perform conditional operation in {} milliseconds", self.timeout ) .into(), @@ -193,7 +231,7 @@ impl DynamoCommit { let expiry = conflict.timeout * self.max_clock_skew_rate; loop { interval.tick().await; - check_not_exists(client, to).await?; + check_precondition(client, to, etag).await?; if conflict.acquire.elapsed() > expiry { previous_lease = Some(conflict); break; @@ -205,8 +243,11 @@ impl DynamoCommit { } /// Retrieve a lock, returning an error if it doesn't exist - async fn get_lock(&self, s3: &S3Client, key: &str) -> Result { - let key_attributes = [("key", AttributeValue::String(key))]; + async fn get_lock(&self, s3: &S3Client, path: &str, etag: Option<&str>) -> Result { + let key_attributes = [ + ("path", AttributeValue::from(path)), + ("etag", AttributeValue::from(etag.unwrap_or("*"))), + ]; let req = GetItem { table_name: &self.table_name, key: Map(&key_attributes), @@ -216,7 +257,7 @@ impl DynamoCommit { let resp = self .request(s3, credential.as_deref(), "DynamoDB_20120810.GetItem", req) .await - .map_err(|e| e.error(STORE, key.to_string()))?; + .map_err(|e| e.error(STORE, path.to_string()))?; let body = resp.bytes().await.map_err(|e| Error::Generic { store: STORE, @@ -230,7 +271,7 @@ impl DynamoCommit { })?; extract_lease(&response.item).ok_or_else(|| Error::NotFound { - path: key.into(), + path: path.into(), source: "DynamoDB GetItem returned no items".to_string().into(), }) } @@ -239,7 +280,8 @@ impl DynamoCommit { async fn try_lock( &self, s3: &S3Client, - key: &str, + path: &str, + etag: Option<&str>, existing: Option<&Lease>, ) -> Result { let attributes; @@ -257,12 +299,13 @@ impl DynamoCommit { let ttl = (Utc::now() + self.ttl).timestamp(); let items = [ - ("key", AttributeValue::String(key)), + ("path", 
AttributeValue::from(path)), + ("etag", AttributeValue::from(etag.unwrap_or("*"))), ("generation", AttributeValue::Number(next_gen)), ("timeout", AttributeValue::Number(self.timeout)), ("ttl", AttributeValue::Number(ttl as _)), ]; - let names = [("#pk", "key")]; + let names = [("#pk", "path")]; let req = PutItem { table_name: &self.table_name, @@ -302,7 +345,9 @@ impl DynamoCommit { // // // - None => Ok(TryLockResult::Conflict(self.get_lock(s3, key).await?)), + None => Ok(TryLockResult::Conflict( + self.get_lock(s3, path, etag).await?, + )), }, _ => Err(Error::Generic { store: STORE, @@ -347,19 +392,37 @@ enum TryLockResult { Conflict(Lease), } -/// Returns an [`Error::AlreadyExists`] if `path` exists -async fn check_not_exists(client: &S3Client, path: &Path) -> Result<()> { +/// Validates that `path` has the given `etag` or doesn't exist if `None` +async fn check_precondition(client: &S3Client, path: &Path, etag: Option<&str>) -> Result<()> { let options = GetOptions { head: true, ..Default::default() }; - match client.get_opts(path, options).await { - Ok(_) => Err(Error::AlreadyExists { - path: path.to_string(), - source: "Already Exists".to_string().into(), - }), - Err(Error::NotFound { .. }) => Ok(()), - Err(e) => Err(e), + + match etag { + Some(expected) => match client.get_opts(path, options).await { + Ok(r) => match r.meta.e_tag { + Some(actual) if expected == actual => Ok(()), + actual => Err(Error::Precondition { + path: path.to_string(), + source: format!("{} does not match {expected}", actual.unwrap_or_default()) + .into(), + }), + }, + Err(Error::NotFound { .. }) => Err(Error::Precondition { + path: path.to_string(), + source: format!("Object at location {path} not found").into(), + }), + Err(e) => Err(e), + }, + None => match client.get_opts(path, options).await { + Ok(_) => Err(Error::AlreadyExists { + path: path.to_string(), + source: "Already Exists".to_string().into(), + }), + Err(Error::NotFound { .. 
}) => Ok(()), + Err(e) => Err(e), + }, } } @@ -493,11 +556,17 @@ impl<'a, K: Serialize, V: Serialize> Serialize for Map<'a, K, V> { #[derive(Debug, Serialize, Deserialize)] enum AttributeValue<'a> { #[serde(rename = "S")] - String(&'a str), + String(Cow<'a, str>), #[serde(rename = "N", with = "number")] Number(u64), } +impl<'a> From<&'a str> for AttributeValue<'a> { + fn from(value: &'a str) -> Self { + Self::String(Cow::Borrowed(value)) + } +} + /// Numbers are serialized as strings mod number { use serde::{Deserialize, Deserializer, Serializer}; @@ -518,10 +587,11 @@ pub(crate) use tests::integration_test; #[cfg(test)] mod tests { - use super::*; use crate::aws::AmazonS3; use crate::ObjectStore; + use rand::distributions::Alphanumeric; + use rand::{thread_rng, Rng}; #[test] fn test_attribute_serde() { @@ -544,24 +614,43 @@ mod tests { let _ = integration.delete(&dst).await; // Delete if present // Create a lock if not already exists - let existing = match d.try_lock(client, dst.as_ref(), None).await.unwrap() { + let existing = match d.try_lock(client, dst.as_ref(), None, None).await.unwrap() { TryLockResult::Conflict(l) => l, TryLockResult::Ok(l) => l, }; // Should not be able to acquire a lock again - let r = d.try_lock(client, dst.as_ref(), None).await; + let r = d.try_lock(client, dst.as_ref(), None, None).await; assert!(matches!(r, Ok(TryLockResult::Conflict(_)))); // But should still be able to reclaim lock and perform copy d.copy_if_not_exists(client, &src, &dst).await.unwrap(); - match d.try_lock(client, dst.as_ref(), None).await.unwrap() { + match d.try_lock(client, dst.as_ref(), None, None).await.unwrap() { TryLockResult::Conflict(new) => { // Should have incremented generation to do so assert_eq!(new.generation, existing.generation + 1); } _ => panic!("Should conflict"), } + + let rng = thread_rng(); + let etag = String::from_utf8(rng.sample_iter(Alphanumeric).take(32).collect()).unwrap(); + let t = Some(etag.as_str()); + + let l = match d.try_lock(client, dst.as_ref(), t, None).await.unwrap() { + TryLockResult::Ok(l) => l, + _ => panic!("should not conflict"), + }; + + match d.try_lock(client, dst.as_ref(), t, None).await.unwrap() { + TryLockResult::Conflict(c) => assert_eq!(l.generation, c.generation), + _ => panic!("should conflict"), + } + + match d.try_lock(client, dst.as_ref(), t, Some(&l)).await.unwrap() { + TryLockResult::Ok(new) => assert_eq!(new.generation, l.generation + 1), + _ => panic!("should not conflict"), + } } } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 20e7b03..d167c78 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -187,12 +187,26 @@ impl ObjectStore for AmazonS3 { r => r, } } - (PutMode::Update(v), Some(S3ConditionalPut::ETagMatch)) => { + (PutMode::Create, Some(S3ConditionalPut::Dynamo(d))) => { + d.conditional_op(&self.client, location, None, move || request.do_put()) + .await + } + (PutMode::Update(v), Some(put)) => { let etag = v.e_tag.ok_or_else(|| Error::Generic { store: STORE, source: "ETag required for conditional put".to_string().into(), })?; - request.header(&IF_MATCH, etag.as_str()).do_put().await + match put { + S3ConditionalPut::ETagMatch => { + request.header(&IF_MATCH, etag.as_str()).do_put().await + } + S3ConditionalPut::Dynamo(d) => { + d.conditional_op(&self.client, location, Some(&etag), move || { + request.do_put() + }) + .await + } + } } } } diff --git a/src/aws/precondition.rs b/src/aws/precondition.rs index 83d45db..ad9e215 100644 --- a/src/aws/precondition.rs +++ b/src/aws/precondition.rs @@ -48,7 +48,7 @@ pub enum 
S3CopyIfNotExists { HeaderWithStatus(String, String, reqwest::StatusCode), /// The name of a DynamoDB table to use for coordination /// - /// Encoded as either `dynamodb:` or `dynamodb::` + /// Encoded as either `dynamo:` or `dynamo::` /// ignoring whitespace. The default timeout is used if not specified /// /// See [`DynamoCommit`] for more information @@ -88,12 +88,7 @@ impl S3CopyIfNotExists { code, )) } - "dynamo" => Some(Self::Dynamo(match value.split_once(':') { - Some((table_name, timeout)) => DynamoCommit::new(table_name.trim().to_string()) - .with_timeout(timeout.parse().ok()?), - None => DynamoCommit::new(value.trim().to_string()), - })), - + "dynamo" => Some(Self::Dynamo(DynamoCommit::from_str(value)?)), _ => None, } } @@ -111,7 +106,7 @@ impl Parse for S3CopyIfNotExists { /// Configure how to provide conditional put support for [`AmazonS3`]. /// /// [`AmazonS3`]: super::AmazonS3 -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Eq, PartialEq)] #[allow(missing_copy_implementations)] #[non_exhaustive] pub enum S3ConditionalPut { @@ -122,12 +117,23 @@ pub enum S3ConditionalPut { /// /// [HTTP precondition]: https://datatracker.ietf.org/doc/html/rfc9110#name-preconditions ETagMatch, + + /// The name of a DynamoDB table to use for coordination + /// + /// Encoded as either `dynamo:` or `dynamo::` + /// ignoring whitespace. The default timeout is used if not specified + /// + /// See [`DynamoCommit`] for more information + /// + /// This will use the same region, credentials and endpoint as configured for S3 + Dynamo(DynamoCommit), } impl std::fmt::Display for S3ConditionalPut { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ETagMatch => write!(f, "etag"), + Self::Dynamo(lock) => write!(f, "dynamo: {}", lock.table_name()), } } } @@ -136,7 +142,10 @@ impl S3ConditionalPut { fn from_str(s: &str) -> Option { match s.trim() { "etag" => Some(Self::ETagMatch), - _ => None, + trimmed => match trimmed.split_once(':')? 
{ + ("dynamo", s) => Some(Self::Dynamo(DynamoCommit::from_str(s)?)), + _ => None, + }, } } } @@ -153,6 +162,7 @@ impl Parse for S3ConditionalPut { #[cfg(test)] mod tests { use super::S3CopyIfNotExists; + use crate::aws::{DynamoCommit, S3ConditionalPut}; #[test] fn parse_s3_copy_if_not_exists_header() { @@ -177,6 +187,24 @@ mod tests { assert_eq!(expected, S3CopyIfNotExists::from_str(input)); } + #[test] + fn parse_s3_copy_if_not_exists_dynamo() { + let input = "dynamo: table:100"; + let expected = Some(S3CopyIfNotExists::Dynamo( + DynamoCommit::new("table".into()).with_timeout(100), + )); + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + + #[test] + fn parse_s3_condition_put_dynamo() { + let input = "dynamo: table:1300"; + let expected = Some(S3ConditionalPut::Dynamo( + DynamoCommit::new("table".into()).with_timeout(1300), + )); + assert_eq!(expected, S3ConditionalPut::from_str(input)); + } + #[test] fn parse_s3_copy_if_not_exists_header_whitespace_invariant() { let expected = Some(S3CopyIfNotExists::Header( diff --git a/src/lib.rs b/src/lib.rs index ab462cc..8fc47b2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1233,6 +1233,7 @@ mod tests { use crate::test_util::flatten_list_stream; use chrono::TimeZone; use futures::stream::FuturesUnordered; + use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; use std::future::Future; use tokio::io::AsyncWriteExt; @@ -1726,8 +1727,15 @@ mod tests { } pub(crate) async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { + // When using DynamoCommit repeated runs of this test will produce the same sequence of records in DynamoDB + // As a result each conditional operation will need to wait for the lease to timeout before proceeding + // One solution would be to clear DynamoDB before each test, but this would require non-trivial additional code + // so we instead just generate a random suffix for the filenames + let rng = thread_rng(); + let suffix = String::from_utf8(rng.sample_iter(Alphanumeric).take(32).collect()).unwrap(); + delete_fixtures(storage).await; - let path = Path::from("put_opts"); + let path = Path::from(format!("put_opts_{suffix}")); let v1 = storage .put_opts(&path, "a".into(), PutMode::Create.into()) .await @@ -1779,7 +1787,7 @@ mod tests { const NUM_WORKERS: usize = 5; const NUM_INCREMENTS: usize = 10; - let path = Path::from("RACE"); + let path = Path::from(format!("RACE-{suffix}")); let mut futures: FuturesUnordered<_> = (0..NUM_WORKERS) .map(|_| async { for _ in 0..NUM_INCREMENTS { From a90a7a537fa96311a9b675325c6a60bea7eea76b Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 5 Jan 2024 06:17:00 +0000 Subject: [PATCH 255/397] object_store: full HTTP range support (#5222) * object_store: full HTTP range support - Support suffix and offset ranges in GetOptions and get_opts - Ensure that, if a range is requested, the response contains exactly that range * object_store: review comments - Use idiomatic snafu error handling - fast-fail on azure suffix requests - remove unused GetRange utilities * Cleanup * Further cleanup / fixes * object_store: Display for GetRange includes bytes= * Update object_store/src/util.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Use size from ContentRange * Update test * Fix as_range * Update test * Tighten range validation logic - Raise an error before the request is made if the range has <= 0 bytes in it - `GetRange::as_range` now handles more out-of-bounds cases, although in most cases these should result in a 416 from 
the server anyway. * allow return of partial range * Tweak docs and loosen suffix restrictions * Fix Azure and Memory --------- Co-authored-by: Raphael Taylor-Davies Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/azure/client.rs | 10 ++- src/client/get.rs | 69 ++++++++++++--- src/client/mod.rs | 3 +- src/lib.rs | 67 ++++++++++++++- src/local.rs | 13 ++- src/memory.rs | 35 +++----- src/util.rs | 182 +++++++++++++++++++++++++++++++++++++++- tests/get_range_file.rs | 25 ++++++ 8 files changed, 360 insertions(+), 44 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index 865e0a1..41b7cbd 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -25,7 +25,7 @@ use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; -use crate::util::deserialize_rfc1123; +use crate::util::{deserialize_rfc1123, GetRange}; use crate::{ ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutMode, PutOptions, PutResult, Result, RetryConfig, @@ -441,6 +441,14 @@ impl GetClient for AzureClient { /// /// async fn get_request(&self, path: &Path, options: GetOptions) -> Result { + // As of 2024-01-02, Azure does not support suffix requests, + // so we should fail fast here rather than sending one + if let Some(GetRange::Suffix(_)) = options.range.as_ref() { + return Err(crate::Error::NotSupported { + source: "Azure does not support suffix range requests".into(), + }); + } + let credential = self.get_credential().await?; let url = self.config.path_url(path); let method = match options.head { diff --git a/src/client/get.rs b/src/client/get.rs index b7e7f24..2e399e5 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -19,7 +19,7 @@ use std::ops::Range; use crate::client::header::{header_meta, HeaderConfig}; use crate::path::Path; -use crate::{Error, GetOptions, GetResult, GetResultPayload, Result}; +use crate::{GetOptions, GetRange, GetResult, GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; use hyper::header::CONTENT_RANGE; @@ -49,6 +49,12 @@ pub trait GetClientExt { impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); + if let Some(r) = range.as_ref() { + r.is_valid().map_err(|e| crate::Error::Generic { + store: T::STORE, + source: Box::new(e), + })?; + } let response = self.get_request(location, options).await?; get_result::(location, range, response).map_err(|e| crate::Error::Generic { store: T::STORE, @@ -94,6 +100,11 @@ enum GetResultError { source: crate::client::header::Error, }, + #[snafu(context(false))] + InvalidRangeRequest { + source: crate::util::InvalidGetRange, + }, + #[snafu(display("Received non-partial response when range requested"))] NotPartial, @@ -115,7 +126,7 @@ enum GetResultError { fn get_result( location: &Path, - range: Option>, + range: Option, response: Response, ) -> Result { let mut meta = header_meta(location, response.headers(), T::HEADER_CONFIG)?; @@ -135,13 +146,16 @@ fn get_result( let value = ContentRange::from_str(value).context(ParseContentRangeSnafu { value })?; let actual = value.range; + // Update size to reflect full size of object (#5272) + meta.size = value.size; + + let expected = expected.as_range(meta.size)?; + ensure!( actual == expected, UnexpectedRangeSnafu { expected, actual } ); - // Update size to reflect full size of object (#5272) - meta.size = value.size; actual } else { 0..meta.size @@ -149,7 
+163,7 @@ fn get_result( let stream = response .bytes_stream() - .map_err(|source| Error::Generic { + .map_err(|source| crate::Error::Generic { store: T::STORE, source: Box::new(source), }) @@ -220,20 +234,22 @@ mod tests { let bytes = res.bytes().await.unwrap(); assert_eq!(bytes.len(), 12); + let get_range = GetRange::from(2..3); + let resp = make_response( 12, Some(2..3), StatusCode::PARTIAL_CONTENT, Some("bytes 2-2/12"), ); - let res = get_result::(&path, Some(2..3), resp).unwrap(); + let res = get_result::(&path, Some(get_range.clone()), resp).unwrap(); assert_eq!(res.meta.size, 12); assert_eq!(res.range, 2..3); let bytes = res.bytes().await.unwrap(); assert_eq!(bytes.len(), 1); let resp = make_response(12, Some(2..3), StatusCode::OK, None); - let err = get_result::(&path, Some(2..3), resp).unwrap_err(); + let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( err.to_string(), "Received non-partial response when range requested" @@ -245,7 +261,7 @@ mod tests { StatusCode::PARTIAL_CONTENT, Some("bytes 2-3/12"), ); - let err = get_result::(&path, Some(2..3), resp).unwrap_err(); + let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!(err.to_string(), "Requested 2..3, got 2..4"); let resp = make_response( @@ -254,17 +270,50 @@ mod tests { StatusCode::PARTIAL_CONTENT, Some("bytes 2-2/*"), ); - let err = get_result::(&path, Some(2..3), resp).unwrap_err(); + let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( err.to_string(), "Failed to parse value for CONTENT_RANGE header: \"bytes 2-2/*\"" ); let resp = make_response(12, Some(2..3), StatusCode::PARTIAL_CONTENT, None); - let err = get_result::(&path, Some(2..3), resp).unwrap_err(); + let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( err.to_string(), "Content-Range header not present in partial response" ); + + let resp = make_response( + 2, + Some(2..3), + StatusCode::PARTIAL_CONTENT, + Some("bytes 2-3/2"), + ); + let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); + assert_eq!( + err.to_string(), + "InvalidRangeRequest: Wanted range starting at 2, but object was only 2 bytes long" + ); + + let resp = make_response( + 6, + Some(2..6), + StatusCode::PARTIAL_CONTENT, + Some("bytes 2-5/6"), + ); + let res = get_result::(&path, Some(GetRange::Suffix(4)), resp).unwrap(); + assert_eq!(res.meta.size, 6); + assert_eq!(res.range, 2..6); + let bytes = res.bytes().await.unwrap(); + assert_eq!(bytes.len(), 4); + + let resp = make_response( + 6, + Some(2..6), + StatusCode::PARTIAL_CONTENT, + Some("bytes 2-3/6"), + ); + let err = get_result::(&path, Some(GetRange::Suffix(4)), resp).unwrap_err(); + assert_eq!(err.to_string(), "Requested 2..6, got 2..4"); } } diff --git a/src/client/mod.rs b/src/client/mod.rs index 2baf586..4a78927 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -594,8 +594,7 @@ impl GetOptionsExt for RequestBuilder { use hyper::header::*; if let Some(range) = options.range { - let range = format!("bytes={}-{}", range.start, range.end.saturating_sub(1)); - self = self.header(RANGE, range); + self = self.header(RANGE, range.to_string()); } if let Some(tag) = options.if_match { diff --git a/src/lib.rs b/src/lib.rs index 8fc47b2..53a5356 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -499,6 +499,7 @@ mod parse; mod util; pub use parse::{parse_url, parse_url_opts}; +pub use util::GetRange; use crate::path::Path; #[cfg(not(target_arch = "wasm32"))] @@ -580,10 +581,12 @@ pub trait 
ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; /// Return the bytes that are stored at the specified location - /// in the given byte range + /// in the given byte range. + /// + /// See [`GetRange::Bounded`] for more details on how `range` gets interpreted async fn get_range(&self, location: &Path, range: Range) -> Result { let options = GetOptions { - range: Some(range.clone()), + range: Some(range.into()), ..Default::default() }; self.get_opts(location, options).await?.bytes().await @@ -913,7 +916,7 @@ pub struct GetOptions { /// otherwise returning [`Error::NotModified`] /// /// - pub range: Option>, + pub range: Option, /// Request a particular object version pub version: Option, /// Request transfer of no content @@ -1308,7 +1311,7 @@ mod tests { assert_eq!(bytes, expected_data.slice(range.clone())); let opts = GetOptions { - range: Some(2..5), + range: Some(GetRange::Bounded(2..5)), ..Default::default() }; let result = storage.get_opts(&location, opts).await.unwrap(); @@ -1324,6 +1327,62 @@ mod tests { // Should be a non-fatal error out_of_range_result.unwrap_err(); + let opts = GetOptions { + range: Some(GetRange::Bounded(2..100)), + ..Default::default() + }; + let result = storage.get_opts(&location, opts).await.unwrap(); + assert_eq!(result.range, 2..14); + assert_eq!(result.meta.size, 14); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"bitrary data".as_ref()); + + let opts = GetOptions { + range: Some(GetRange::Suffix(2)), + ..Default::default() + }; + match storage.get_opts(&location, opts).await { + Ok(result) => { + assert_eq!(result.range, 12..14); + assert_eq!(result.meta.size, 14); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"ta".as_ref()); + } + Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let opts = GetOptions { + range: Some(GetRange::Suffix(100)), + ..Default::default() + }; + match storage.get_opts(&location, opts).await { + Ok(result) => { + assert_eq!(result.range, 0..14); + assert_eq!(result.meta.size, 14); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"arbitrary data".as_ref()); + } + Err(Error::NotSupported { .. 
}) => {} + Err(e) => panic!("{e}"), + } + + let opts = GetOptions { + range: Some(GetRange::Offset(3)), + ..Default::default() + }; + let result = storage.get_opts(&location, opts).await.unwrap(); + assert_eq!(result.range, 3..14); + assert_eq!(result.meta.size, 14); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"itrary data".as_ref()); + + let opts = GetOptions { + range: Some(GetRange::Offset(100)), + ..Default::default() + }; + storage.get_opts(&location, opts).await.unwrap_err(); + let ranges = vec![0..1, 2..3, 0..5]; let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); for (range, bytes) in ranges.iter().zip(bytes) { diff --git a/src/local.rs b/src/local.rs index 71b96f0..e985ff0 100644 --- a/src/local.rs +++ b/src/local.rs @@ -19,6 +19,7 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, + util::InvalidGetRange, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, ObjectStore, PutMode, PutOptions, PutResult, Result, }; @@ -111,6 +112,11 @@ pub(crate) enum Error { actual: usize, }, + #[snafu(display("Requested range was invalid"))] + InvalidRange { + source: InvalidGetRange, + }, + #[snafu(display("Unable to copy file from {} to {}: {}", from.display(), to.display(), source))] UnableToCopyFile { from: PathBuf, @@ -424,9 +430,14 @@ impl ObjectStore for LocalFileSystem { let meta = convert_metadata(metadata, location)?; options.check_preconditions(&meta)?; + let range = match options.range { + Some(r) => r.as_range(meta.size).context(InvalidRangeSnafu)?, + None => 0..meta.size, + }; + Ok(GetResult { payload: GetResultPayload::File(file, path), - range: options.range.unwrap_or(0..meta.size), + range, meta, }) }) diff --git a/src/memory.rs b/src/memory.rs index 3823001..41cfcc4 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -16,9 +16,10 @@ // under the License. //! An in-memory object store implementation +use crate::util::InvalidGetRange; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutMode, - PutOptions, PutResult, Result, UpdateVersion, + path::Path, GetRange, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, + PutMode, PutOptions, PutResult, Result, UpdateVersion, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -26,7 +27,7 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use parking_lot::RwLock; -use snafu::{ensure, OptionExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::io; @@ -43,13 +44,8 @@ enum Error { #[snafu(display("No data in memory found. 
Location: {path}"))] NoDataInMemory { path: String }, - #[snafu(display( - "Requested range {}..{} is out of bounds for object with length {}", range.start, range.end, len - ))] - OutOfRange { range: Range, len: usize }, - - #[snafu(display("Invalid range: {}..{}", range.start, range.end))] - BadRange { range: Range }, + #[snafu(display("Invalid range: {source}"))] + Range { source: InvalidGetRange }, #[snafu(display("Object already exists at that location: {path}"))] AlreadyExists { path: String }, @@ -220,10 +216,8 @@ impl ObjectStore for InMemory { let (range, data) = match options.range { Some(range) => { - let len = entry.data.len(); - ensure!(range.end <= len, OutOfRangeSnafu { range, len }); - ensure!(range.start <= range.end, BadRangeSnafu { range }); - (range.clone(), entry.data.slice(range)) + let r = range.as_range(entry.data.len()).context(RangeSnafu)?; + (r.clone(), entry.data.slice(r)) } None => (0..entry.data.len(), entry.data), }; @@ -241,14 +235,11 @@ impl ObjectStore for InMemory { ranges .iter() .map(|range| { - let range = range.clone(); - let len = entry.data.len(); - ensure!( - range.end <= entry.data.len(), - OutOfRangeSnafu { range, len } - ); - ensure!(range.start <= range.end, BadRangeSnafu { range }); - Ok(entry.data.slice(range)) + let r = GetRange::Bounded(range.clone()) + .as_range(entry.data.len()) + .context(RangeSnafu)?; + + Ok(entry.data.slice(r)) }) .collect() } diff --git a/src/util.rs b/src/util.rs index fd86ba7..a19d5aa 100644 --- a/src/util.rs +++ b/src/util.rs @@ -16,9 +16,15 @@ // under the License. //! Common logic for interacting with remote object stores +use std::{ + fmt::Display, + ops::{Range, RangeBounds}, +}; + use super::Result; use bytes::Bytes; use futures::{stream::StreamExt, Stream, TryStreamExt}; +use snafu::Snafu; #[cfg(any(feature = "azure", feature = "http"))] pub static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; @@ -98,12 +104,12 @@ pub const OBJECT_STORE_COALESCE_PARALLEL: usize = 10; /// * Make multiple `fetch` requests in parallel (up to maximum of 10) /// pub async fn coalesce_ranges( - ranges: &[std::ops::Range], + ranges: &[Range], fetch: F, coalesce: usize, ) -> Result, E> where - F: Send + FnMut(std::ops::Range) -> Fut, + F: Send + FnMut(Range) -> Fut, E: Send, Fut: std::future::Future> + Send, { @@ -124,13 +130,13 @@ where let start = range.start - fetch_range.start; let end = range.end - fetch_range.start; - fetch_bytes.slice(start..end) + fetch_bytes.slice(start..end.min(fetch_bytes.len())) }) .collect()) } /// Returns a sorted list of ranges that cover `ranges` -fn merge_ranges(ranges: &[std::ops::Range], coalesce: usize) -> Vec> { +fn merge_ranges(ranges: &[Range], coalesce: usize) -> Vec> { if ranges.is_empty() { return vec![]; } @@ -167,6 +173,119 @@ fn merge_ranges(ranges: &[std::ops::Range], coalesce: usize) -> Vec), + /// Request all bytes starting from a given byte offset + Offset(usize), + /// Request up to the last n bytes + Suffix(usize), +} + +#[derive(Debug, Snafu)] +pub(crate) enum InvalidGetRange { + #[snafu(display( + "Wanted range starting at {requested}, but object was only {length} bytes long" + ))] + StartTooLarge { requested: usize, length: usize }, + + #[snafu(display("Range started at {start} and ended at {end}"))] + Inconsistent { start: usize, end: usize }, +} + +impl GetRange { + pub(crate) fn is_valid(&self) -> Result<(), InvalidGetRange> { + match self { + Self::Bounded(r) if r.end <= r.start => { + return Err(InvalidGetRange::Inconsistent { + start: r.start, + end: r.end, + }); + } + _ => (), 
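(Usage aside, not part of the patch: a minimal sketch of how the `GetRange` variants introduced here are driven through `get_opts`. The store contents, the path, and the 14-byte object size are assumptions for illustration only; the behaviour mirrors the tests added to `src/lib.rs` earlier in this patch.)

```rust
use object_store::{path::Path, GetOptions, GetRange, ObjectStore};

// Sketch only: `store` is assumed to already hold a 14-byte object at `path`.
async fn range_examples(store: &dyn ObjectStore, path: &Path) -> object_store::Result<()> {
    // Explicit byte range; an end past the object length is truncated rather than erroring.
    let opts = GetOptions { range: Some(GetRange::Bounded(2..100)), ..Default::default() };
    let r = store.get_opts(path, opts).await?;
    assert_eq!(r.range, 2..14);

    // Last two bytes. Some backends (e.g. Azure) reject suffix requests as `NotSupported`.
    let opts = GetOptions { range: Some(GetRange::Suffix(2)), ..Default::default() };
    let _tail = store.get_opts(path, opts).await?.bytes().await?;

    // Everything from byte 3 onwards; plain Rust ranges also convert, e.g. `(3..).into()`.
    let opts = GetOptions { range: Some(GetRange::Offset(3)), ..Default::default() };
    let _rest = store.get_opts(path, opts).await?.bytes().await?;
    Ok(())
}
```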
+ }; + Ok(()) + } + + /// Convert to a [`Range`] if valid. + pub(crate) fn as_range(&self, len: usize) -> Result, InvalidGetRange> { + self.is_valid()?; + match self { + Self::Bounded(r) => { + if r.start >= len { + Err(InvalidGetRange::StartTooLarge { + requested: r.start, + length: len, + }) + } else if r.end > len { + Ok(r.start..len) + } else { + Ok(r.clone()) + } + } + Self::Offset(o) => { + if *o >= len { + Err(InvalidGetRange::StartTooLarge { + requested: *o, + length: len, + }) + } else { + Ok(*o..len) + } + } + Self::Suffix(n) => Ok(len.saturating_sub(*n)..len), + } + } +} + +impl Display for GetRange { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Bounded(r) => write!(f, "bytes={}-{}", r.start, r.end - 1), + Self::Offset(o) => write!(f, "bytes={o}-"), + Self::Suffix(n) => write!(f, "bytes=-{n}"), + } + } +} + +impl> From for GetRange { + fn from(value: T) -> Self { + use std::ops::Bound::*; + let first = match value.start_bound() { + Included(i) => *i, + Excluded(i) => i + 1, + Unbounded => 0, + }; + match value.end_bound() { + Included(i) => Self::Bounded(first..(i + 1)), + Excluded(i) => Self::Bounded(first..*i), + Unbounded => Self::Offset(first), + } + } +} + #[cfg(test)] mod tests { use crate::Error; @@ -269,4 +388,59 @@ mod tests { } } } + + #[test] + fn getrange_str() { + assert_eq!(GetRange::Offset(0).to_string(), "bytes=0-"); + assert_eq!(GetRange::Bounded(10..19).to_string(), "bytes=10-18"); + assert_eq!(GetRange::Suffix(10).to_string(), "bytes=-10"); + } + + #[test] + fn getrange_from() { + assert_eq!(Into::::into(10..15), GetRange::Bounded(10..15),); + assert_eq!(Into::::into(10..=15), GetRange::Bounded(10..16),); + assert_eq!(Into::::into(10..), GetRange::Offset(10),); + assert_eq!(Into::::into(..=15), GetRange::Bounded(0..16)); + } + + #[test] + fn test_as_range() { + let range = GetRange::Bounded(2..5); + assert_eq!(range.as_range(5).unwrap(), 2..5); + + let range = range.as_range(4).unwrap(); + assert_eq!(range, 2..4); + + let range = GetRange::Bounded(3..3); + let err = range.as_range(2).unwrap_err().to_string(); + assert_eq!(err, "Range started at 3 and ended at 3"); + + let range = GetRange::Bounded(2..2); + let err = range.as_range(3).unwrap_err().to_string(); + assert_eq!(err, "Range started at 2 and ended at 2"); + + let range = GetRange::Suffix(3); + assert_eq!(range.as_range(3).unwrap(), 0..3); + assert_eq!(range.as_range(2).unwrap(), 0..2); + + let range = GetRange::Suffix(0); + assert_eq!(range.as_range(0).unwrap(), 0..0); + + let range = GetRange::Offset(2); + let err = range.as_range(2).unwrap_err().to_string(); + assert_eq!( + err, + "Wanted range starting at 2, but object was only 2 bytes long" + ); + + let err = range.as_range(1).unwrap_err().to_string(); + assert_eq!( + err, + "Wanted range starting at 2, but object was only 1 bytes long" + ); + + let range = GetRange::Offset(1); + assert_eq!(range.as_range(2).unwrap(), 1..2); + } } diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index 85231a5..f73d785 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -93,4 +93,29 @@ async fn test_get_range() { let data = store.get_range(&path, range.clone()).await.unwrap(); assert_eq!(&data[..], &expected[range]) } + + let over_range = 0..(expected.len() * 2); + let data = store.get_range(&path, over_range.clone()).await.unwrap(); + assert_eq!(&data[..], expected) +} + +/// Test that, when a requesting a range which overhangs the end of the resource, +/// the resulting 
[GetResult::range] reports the returned range, +/// not the requested. +#[tokio::test] +async fn test_get_opts_over_range() { + let tmp = tempdir().unwrap(); + let store = MyStore(LocalFileSystem::new_with_prefix(tmp.path()).unwrap()); + let path = Path::from("foo"); + + let expected = Bytes::from_static(b"hello world"); + store.put(&path, expected.clone()).await.unwrap(); + + let opts = GetOptions { + range: Some(GetRange::Bounded(0..(expected.len() * 2))), + ..Default::default() + }; + let res = store.get_opts(&path, opts).await.unwrap(); + assert_eq!(res.range, 0..expected.len()); + assert_eq!(res.bytes().await.unwrap(), expected); } From 1d377574c512f51a27d87aee482c1c0a1c19be5a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 5 Jan 2024 11:08:11 +0000 Subject: [PATCH 256/397] Support S3 Express One Zone (#5268) * Support S3 Express One Zone * Fix endpoint * Update object_store/src/aws/builder.rs * Fix credential caching * Review feedback * Clippy --- src/aws/builder.rs | 93 ++++++++++++++++++++++++---- src/aws/client.rs | 137 +++++++++++++++++++++------------------- src/aws/credential.rs | 141 ++++++++++++++++++++++++++++++++---------- src/aws/dynamo.rs | 5 +- src/aws/mod.rs | 24 +++++-- src/client/mod.rs | 7 +++ src/client/token.rs | 18 +++--- 7 files changed, 301 insertions(+), 124 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 542f17a..9a296bc 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -17,7 +17,7 @@ use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ - InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, + InstanceCredentialProvider, SessionProvider, TaskCredentialProvider, WebIdentityProvider, }; use crate::aws::{ AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3ConditionalPut, S3CopyIfNotExists, @@ -31,6 +31,7 @@ use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use tracing::info; use url::Url; @@ -77,6 +78,9 @@ enum Error { source: reqwest::Error, }, + #[snafu(display("Invalid Zone suffix for bucket '{bucket}'"))] + ZoneSuffix { bucket: String }, + #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] RegionParse { bucket: String }, } @@ -134,6 +138,8 @@ pub struct AmazonS3Builder { imdsv1_fallback: ConfigValue, /// When set to true, virtual hosted style request has to be used virtual_hosted_style_request: ConfigValue, + /// When set to true, S3 express is used + s3_express: ConfigValue, /// When set to true, unsigned payload option has to be used unsigned_payload: ConfigValue, /// Checksum algorithm which has to be used for object integrity check during upload @@ -307,6 +313,13 @@ pub enum AmazonS3ConfigKey { /// - `disable_tagging` DisableTagging, + /// Enable Support for S3 Express One Zone + /// + /// Supported keys: + /// - `aws_s3_express` + /// - `s3_express` + S3Express, + /// Client options Client(ClientConfigKey), } @@ -322,6 +335,7 @@ impl AsRef for AmazonS3ConfigKey { Self::Token => "aws_session_token", Self::ImdsV1Fallback => "aws_imdsv1_fallback", Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", + Self::S3Express => "aws_s3_express", Self::DefaultRegion => "aws_default_region", Self::MetadataEndpoint => "aws_metadata_endpoint", Self::UnsignedPayload => "aws_unsigned_payload", @@ -351,6 +365,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_virtual_hosted_style_request" | 
"virtual_hosted_style_request" => { Ok(Self::VirtualHostedStyleRequest) } + "aws_s3_express" | "s3_express" => Ok(Self::S3Express), "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), @@ -448,6 +463,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::VirtualHostedStyleRequest => { self.virtual_hosted_style_request.parse(value) } + AmazonS3ConfigKey::S3Express => self.s3_express.parse(value), AmazonS3ConfigKey::DefaultRegion => { self.region = self.region.or_else(|| Some(value.into())) } @@ -497,6 +513,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::VirtualHostedStyleRequest => { Some(self.virtual_hosted_style_request.to_string()) } + AmazonS3ConfigKey::S3Express => Some(self.s3_express.to_string()), AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), AmazonS3ConfigKey::Checksum => { @@ -619,7 +636,8 @@ impl AmazonS3Builder { } /// Sets if virtual hosted style request has to be used. - /// If `virtual_hosted_style_request` is : + /// + /// If `virtual_hosted_style_request` is: /// * false (default): Path style request is used /// * true: Virtual hosted style request is used /// @@ -632,6 +650,12 @@ impl AmazonS3Builder { self } + /// Configure this as an S3 Express One Zone Bucket + pub fn with_s3_express(mut self, s3_express: bool) -> Self { + self.s3_express = s3_express.into(); + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -823,18 +847,39 @@ impl AmazonS3Builder { )) as _ }; - // If `endpoint` is provided then its assumed to be consistent with - // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then - // `endpoint` should have bucket name included. - let bucket_endpoint = if self.virtual_hosted_style_request.get()? { - self.endpoint - .clone() - .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")) - } else { - match &self.endpoint { - None => format!("https://s3.{region}.amazonaws.com/{bucket}"), - Some(endpoint) => format!("{endpoint}/{bucket}"), + let (session_provider, zonal_endpoint) = match self.s3_express.get()? { + true => { + let zone = parse_bucket_az(&bucket).context(ZoneSuffixSnafu { bucket: &bucket })?; + + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-Regions-and-Zones.html + let endpoint = format!("https://{bucket}.s3express-{zone}.{region}.amazonaws.com"); + + let session = Arc::new( + TokenCredentialProvider::new( + SessionProvider { + endpoint: endpoint.clone(), + region: region.clone(), + credentials: Arc::clone(&credentials), + }, + self.client_options.client()?, + self.retry_config.clone(), + ) + .with_min_ttl(Duration::from_secs(60)), // Credentials only valid for 5 minutes + ); + (Some(session as _), Some(endpoint)) } + false => (None, None), + }; + + // If `endpoint` is provided it's assumed to be consistent with `virtual_hosted_style_request` or `s3_express`. + // For example, if `virtual_hosted_style_request` is true then `endpoint` should have bucket name included. 
+ let virtual_hosted = self.virtual_hosted_style_request.get()?; + let bucket_endpoint = match (&self.endpoint, zonal_endpoint, virtual_hosted) { + (Some(endpoint), _, true) => endpoint.clone(), + (Some(endpoint), _, false) => format!("{endpoint}/{bucket}"), + (None, Some(endpoint), _) => endpoint, + (None, None, true) => format!("https://{bucket}.s3.{region}.amazonaws.com"), + (None, None, false) => format!("https://s3.{region}.amazonaws.com/{bucket}"), }; let config = S3Config { @@ -843,6 +888,7 @@ impl AmazonS3Builder { bucket, bucket_endpoint, credentials, + session_provider, retry_config: self.retry_config, client_options: self.client_options, sign_payload: !self.unsigned_payload.get()?, @@ -859,6 +905,13 @@ impl AmazonS3Builder { } } +/// Extracts the AZ from a S3 Express One Zone bucket name +/// +/// +fn parse_bucket_az(bucket: &str) -> Option<&str> { + Some(bucket.strip_suffix("--x-s3")?.rsplit_once("--")?.1) +} + #[cfg(test)] mod tests { use super::*; @@ -1088,4 +1141,18 @@ mod tests { "Generic Config error: \"md5\" is not a valid checksum algorithm" ); } + + #[test] + fn test_parse_bucket_az() { + let cases = [ + ("bucket-base-name--usw2-az1--x-s3", Some("usw2-az1")), + ("bucket-base--name--azid--x-s3", Some("azid")), + ("bucket-base-name", None), + ("bucket-base-name--x-s3", None), + ]; + + for (bucket, expected) in cases { + assert_eq!(parse_bucket_az(bucket), expected) + } + } } diff --git a/src/aws/client.rs b/src/aws/client.rs index 45d97ea..e06a0ce 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -18,7 +18,8 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; use crate::aws::{ - AwsCredentialProvider, S3ConditionalPut, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, + AwsAuthorizer, AwsCredentialProvider, S3ConditionalPut, S3CopyIfNotExists, STORE, + STRICT_PATH_ENCODE_SET, }; use crate::client::get::GetClient; use crate::client::header::{get_etag, HeaderConfig}; @@ -171,6 +172,7 @@ pub struct S3Config { pub bucket: String, pub bucket_endpoint: String, pub credentials: AwsCredentialProvider, + pub session_provider: Option, pub retry_config: RetryConfig, pub client_options: ClientOptions, pub sign_payload: bool, @@ -186,12 +188,54 @@ impl S3Config { format!("{}/{}", self.bucket_endpoint, encode_path(path)) } + async fn get_session_credential(&self) -> Result> { + let credential = match self.skip_signature { + false => { + let provider = self.session_provider.as_ref().unwrap_or(&self.credentials); + Some(provider.get_credential().await?) 
+ } + true => None, + }; + + Ok(SessionCredential { + credential, + session_token: self.session_provider.is_some(), + config: self, + }) + } + pub(crate) async fn get_credential(&self) -> Result>> { Ok(match self.skip_signature { false => Some(self.credentials.get_credential().await?), true => None, }) } + + #[inline] + pub(crate) fn is_s3_express(&self) -> bool { + self.session_provider.is_some() + } +} + +struct SessionCredential<'a> { + credential: Option>, + session_token: bool, + config: &'a S3Config, +} + +impl<'a> SessionCredential<'a> { + fn authorizer(&self) -> Option> { + let mut authorizer = + AwsAuthorizer::new(self.credential.as_deref()?, "s3", &self.config.region) + .with_sign_payload(self.config.sign_payload); + + if self.session_token { + let token = HeaderName::from_static("x-amz-s3session-token"); + authorizer = authorizer.with_token_header(token) + } + + Some(authorizer) + } } #[derive(Debug, Snafu)] @@ -219,6 +263,7 @@ pub(crate) struct Request<'a> { config: &'a S3Config, builder: RequestBuilder, payload_sha256: Option>, + use_session_creds: bool, } impl<'a> Request<'a> { @@ -237,16 +282,18 @@ impl<'a> Request<'a> { } pub async fn send(self) -> Result { - let credential = self.config.get_credential().await?; + let credential = match self.use_session_creds { + true => self.config.get_session_credential().await?, + false => SessionCredential { + credential: self.config.get_credential().await?, + session_token: false, + config: self.config, + }, + }; + let path = self.path.as_ref(); self.builder - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - self.payload_sha256.as_deref(), - ) + .with_aws_sigv4(credential.authorizer(), self.payload_sha256.as_deref()) .send_retry(&self.config.retry_config) .await .context(RetrySnafu { path }) @@ -300,6 +347,7 @@ impl S3Client { builder, payload_sha256, config: &self.config, + use_session_creds: true, } } @@ -309,19 +357,13 @@ impl S3Client { path: &Path, query: &T, ) -> Result<()> { - let credential = self.config.get_credential().await?; + let credential = self.config.get_session_credential().await?; let url = self.config.path_url(path); self.client .request(Method::DELETE, url) .query(query) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) + .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await .map_err(|e| e.error(STORE, path.to_string()))?; @@ -341,7 +383,7 @@ impl S3Client { return Ok(Vec::new()); } - let credential = self.config.get_credential().await?; + let credential = self.config.get_session_credential().await?; let url = format!("{}?delete", self.config.bucket_endpoint); let mut buffer = Vec::new(); @@ -399,13 +441,7 @@ impl S3Client { let response = builder .header(CONTENT_TYPE, "application/xml") .body(body) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - payload_sha256.as_deref(), - ) + .with_aws_sigv4(credential.authorizer(), payload_sha256.as_deref()) .send_retry(&self.config.retry_config) .await .context(DeleteObjectsRequestSnafu {})? 
@@ -452,23 +488,18 @@ impl S3Client { path: from, config: &self.config, payload_sha256: None, + use_session_creds: false, } } pub async fn create_multipart(&self, location: &Path) -> Result { - let credential = self.config.get_credential().await?; + let credential = self.config.get_session_credential().await?; let url = format!("{}?uploads=", self.config.path_url(location),); let response = self .client .request(Method::POST, url) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) + .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await .context(CreateMultipartRequestSnafu)? @@ -510,7 +541,7 @@ impl S3Client { let request = CompleteMultipartUpload::from(parts); let body = quick_xml::se::to_string(&request).unwrap(); - let credential = self.config.get_credential().await?; + let credential = self.config.get_session_credential().await?; let url = self.config.path_url(location); let response = self @@ -518,13 +549,7 @@ impl S3Client { .request(Method::POST, url) .query(&[("uploadId", upload_id)]) .body(body) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) + .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await .context(CompleteMultipartRequestSnafu)?; @@ -547,18 +572,12 @@ impl S3Client { #[cfg(test)] pub async fn get_object_tagging(&self, path: &Path) -> Result { - let credential = self.config.get_credential().await?; + let credential = self.config.get_session_credential().await?; let url = format!("{}?tagging", self.config.path_url(path)); let response = self .client .request(Method::GET, url) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) + .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await .map_err(|e| e.error(STORE, path.to_string()))?; @@ -578,7 +597,7 @@ impl GetClient for S3Client { /// Make an S3 GET request async fn get_request(&self, path: &Path, options: GetOptions) -> Result { - let credential = self.config.get_credential().await?; + let credential = self.config.get_session_credential().await?; let url = self.config.path_url(path); let method = match options.head { true => Method::HEAD, @@ -593,13 +612,7 @@ impl GetClient for S3Client { let response = builder .with_get_options(options) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) + .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await .map_err(|e| e.error(STORE, path.to_string()))?; @@ -618,7 +631,7 @@ impl ListClient for S3Client { token: Option<&str>, offset: Option<&str>, ) -> Result<(ListResult, Option)> { - let credential = self.config.get_credential().await?; + let credential = self.config.get_session_credential().await?; let url = self.config.bucket_endpoint.clone(); let mut query = Vec::with_capacity(4); @@ -645,13 +658,7 @@ impl ListClient for S3Client { .client .request(Method::GET, &url) .query(&query) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) + .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await .context(ListRequestSnafu)? 
diff --git a/src/aws/credential.rs b/src/aws/credential.rs index d290da8..f8614f4 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::aws::{STORE, STRICT_ENCODE_SET, STRICT_PATH_ENCODE_SET}; +use crate::aws::{AwsCredentialProvider, STORE, STRICT_ENCODE_SET, STRICT_PATH_ENCODE_SET}; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::client::TokenProvider; @@ -24,16 +24,40 @@ use crate::{CredentialProvider, Result, RetryConfig}; use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; +use hyper::header::HeaderName; use percent_encoding::utf8_percent_encode; -use reqwest::header::{HeaderMap, HeaderValue}; +use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; +use snafu::{ResultExt, Snafu}; use std::collections::BTreeMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::warn; use url::Url; +#[derive(Debug, Snafu)] +#[allow(clippy::enum_variant_names)] +enum Error { + #[snafu(display("Error performing CreateSession request: {source}"))] + CreateSessionRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting CreateSession response: {source}"))] + CreateSessionResponse { source: reqwest::Error }, + + #[snafu(display("Invalid CreateSessionOutput response: {source}"))] + CreateSessionOutput { source: quick_xml::DeError }, +} + +impl From for crate::Error { + fn from(value: Error) -> Self { + Self::Generic { + store: STORE, + source: Box::new(value), + } + } +} + type StdError = Box; /// SHA256 hash of empty string @@ -75,13 +99,13 @@ pub struct AwsAuthorizer<'a> { credential: &'a AwsCredential, service: &'a str, region: &'a str, + token_header: Option, sign_payload: bool, } -const DATE_HEADER: &str = "x-amz-date"; -const HASH_HEADER: &str = "x-amz-content-sha256"; -const TOKEN_HEADER: &str = "x-amz-security-token"; -const AUTH_HEADER: &str = "authorization"; +static DATE_HEADER: HeaderName = HeaderName::from_static("x-amz-date"); +static HASH_HEADER: HeaderName = HeaderName::from_static("x-amz-content-sha256"); +static TOKEN_HEADER: HeaderName = HeaderName::from_static("x-amz-security-token"); const ALGORITHM: &str = "AWS4-HMAC-SHA256"; impl<'a> AwsAuthorizer<'a> { @@ -93,6 +117,7 @@ impl<'a> AwsAuthorizer<'a> { region, date: None, sign_payload: true, + token_header: None, } } @@ -103,6 +128,12 @@ impl<'a> AwsAuthorizer<'a> { self } + /// Overrides the header name for security tokens, defaults to `x-amz-security-token` + pub(crate) fn with_token_header(mut self, header: HeaderName) -> Self { + self.token_header = Some(header); + self + } + /// Authorize `request` with an optional pre-calculated SHA256 digest by attaching /// the relevant [AWS SigV4] headers /// @@ -119,7 +150,8 @@ impl<'a> AwsAuthorizer<'a> { pub fn authorize(&self, request: &mut Request, pre_calculated_digest: Option<&[u8]>) { if let Some(ref token) = self.credential.token { let token_val = HeaderValue::from_str(token).unwrap(); - request.headers_mut().insert(TOKEN_HEADER, token_val); + let header = self.token_header.as_ref().unwrap_or(&TOKEN_HEADER); + request.headers_mut().insert(header, token_val); } let host = &request.url()[url::Position::BeforeHost..url::Position::AfterPort]; @@ -129,7 +161,7 @@ impl<'a> AwsAuthorizer<'a> { let date = self.date.unwrap_or_else(Utc::now); let date_str = 
date.format("%Y%m%dT%H%M%SZ").to_string(); let date_val = HeaderValue::from_str(&date_str).unwrap(); - request.headers_mut().insert(DATE_HEADER, date_val); + request.headers_mut().insert(&DATE_HEADER, date_val); let digest = match self.sign_payload { false => UNSIGNED_PAYLOAD.to_string(), @@ -146,7 +178,7 @@ impl<'a> AwsAuthorizer<'a> { }; let header_digest = HeaderValue::from_str(&digest).unwrap(); - request.headers_mut().insert(HASH_HEADER, header_digest); + request.headers_mut().insert(&HASH_HEADER, header_digest); let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); @@ -174,7 +206,9 @@ impl<'a> AwsAuthorizer<'a> { ); let authorization_val = HeaderValue::from_str(&authorisation).unwrap(); - request.headers_mut().insert(AUTH_HEADER, authorization_val); + request + .headers_mut() + .insert(&AUTHORIZATION, authorization_val); } pub(crate) fn sign(&self, method: Method, url: &mut Url, expires_in: Duration) { @@ -284,10 +318,7 @@ pub trait CredentialExt { /// Sign a request fn with_aws_sigv4( self, - credential: Option<&AwsCredential>, - region: &str, - service: &str, - sign_payload: bool, + authorizer: Option>, payload_sha256: Option<&[u8]>, ) -> Self; } @@ -295,20 +326,14 @@ pub trait CredentialExt { impl CredentialExt for RequestBuilder { fn with_aws_sigv4( self, - credential: Option<&AwsCredential>, - region: &str, - service: &str, - sign_payload: bool, + authorizer: Option>, payload_sha256: Option<&[u8]>, ) -> Self { - match credential { - Some(credential) => { + match authorizer { + Some(authorizer) => { let (client, request) = self.build_split(); let mut request = request.expect("request valid"); - - AwsAuthorizer::new(credential, service, region) - .with_sign_payload(sign_payload) - .authorize(&mut request, payload_sha256); + authorizer.authorize(&mut request, payload_sha256); Self::from_parts(client, request) } @@ -555,20 +580,20 @@ struct AssumeRoleResponse { #[derive(Debug, Deserialize)] #[serde(rename_all = "PascalCase")] struct AssumeRoleResult { - credentials: AssumeRoleCredentials, + credentials: SessionCredentials, } #[derive(Debug, Deserialize)] #[serde(rename_all = "PascalCase")] -struct AssumeRoleCredentials { +struct SessionCredentials { session_token: String, secret_access_key: String, access_key_id: String, expiration: DateTime, } -impl From for AwsCredential { - fn from(s: AssumeRoleCredentials) -> Self { +impl From for AwsCredential { + fn from(s: SessionCredentials) -> Self { Self { key_id: s.access_key_id, secret_key: s.secret_access_key, @@ -659,6 +684,56 @@ async fn task_credential( }) } +/// A session provider as used by S3 Express One Zone +/// +/// +#[derive(Debug)] +pub struct SessionProvider { + pub endpoint: String, + pub region: String, + pub credentials: AwsCredentialProvider, +} + +#[async_trait] +impl TokenProvider for SessionProvider { + type Credential = AwsCredential; + + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>> { + let creds = self.credentials.get_credential().await?; + let authorizer = AwsAuthorizer::new(&creds, "s3", &self.region); + + let bytes = client + .get(format!("{}?session", self.endpoint)) + .with_aws_sigv4(Some(authorizer), None) + .send_retry(retry) + .await + .context(CreateSessionRequestSnafu)? 
+ .bytes() + .await + .context(CreateSessionResponseSnafu)?; + + let resp: CreateSessionOutput = + quick_xml::de::from_reader(bytes.reader()).context(CreateSessionOutputSnafu)?; + + let creds = resp.credentials; + Ok(TemporaryToken { + token: Arc::new(creds.into()), + // Credentials last 5 minutes - https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateSession.html + expiry: Some(Instant::now() + Duration::from_secs(5 * 60)), + }) + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +struct CreateSessionOutput { + credentials: SessionCredentials, +} + #[cfg(test)] mod tests { use super::*; @@ -700,10 +775,11 @@ mod tests { service: "ec2", region: "us-east-1", sign_payload: true, + token_header: None, }; signer.authorize(&mut request, None); - assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") + assert_eq!(request.headers().get(&AUTHORIZATION).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } #[test] @@ -737,11 +813,12 @@ mod tests { credential: &credential, service: "ec2", region: "us-east-1", + token_header: None, sign_payload: false, }; authorizer.authorize(&mut request, None); - assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699"); + assert_eq!(request.headers().get(&AUTHORIZATION).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699"); } #[test] @@ -762,6 +839,7 @@ mod tests { credential: &credential, service: "s3", region: "us-east-1", + token_header: None, sign_payload: false, }; @@ -813,11 +891,12 @@ mod tests { credential: &credential, service: "s3", region: "us-east-1", + token_header: None, sign_payload: true, }; authorizer.authorize(&mut request, None); - assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=H20ABqCkLZID4rLe/20220809/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=9ebf2f92872066c99ac94e573b4e1b80f4dbb8a32b1e8e23178318746e7d1b4d") + assert_eq!(request.headers().get(&AUTHORIZATION).unwrap(), "AWS4-HMAC-SHA256 Credential=H20ABqCkLZID4rLe/20220809/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=9ebf2f92872066c99ac94e573b4e1b80f4dbb8a32b1e8e23178318746e7d1b4d") } #[tokio::test] diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index f12a421..4331ae2 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -29,7 +29,7 @@ use serde::{Deserialize, Serialize, Serializer}; use crate::aws::client::S3Client; use crate::aws::credential::CredentialExt; -use crate::aws::AwsCredential; +use crate::aws::{AwsAuthorizer, AwsCredential}; use crate::client::get::GetClientExt; use crate::client::retry::Error as RetryError; use crate::client::retry::RetryExt; @@ -365,6 +365,7 @@ impl DynamoCommit { req: R, ) -> Result { let region = &s3.config.region; + let authorizer = 
cred.map(|x| AwsAuthorizer::new(x, "dynamodb", region)); let builder = match &s3.config.endpoint { Some(e) => s3.client.post(e), @@ -378,7 +379,7 @@ impl DynamoCommit { .timeout(Duration::from_millis(self.timeout)) .json(&req) .header("X-Amz-Target", target) - .with_aws_sigv4(cred, region, "dynamodb", true, None) + .with_aws_sigv4(authorizer, None) .send_retry(&s3.config.retry_config) .await } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index d167c78..4e88524 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -269,6 +269,16 @@ impl ObjectStore for AmazonS3 { prefix: Option<&Path>, offset: &Path, ) -> BoxStream<'_, Result> { + if self.client.config.is_s3_express() { + let offset = offset.clone(); + // S3 Express does not support start-after + return self + .client + .list(prefix) + .try_filter(move |f| futures::future::ready(f.location > offset)) + .boxed(); + } + self.client.list_with_offset(prefix, offset) } @@ -388,11 +398,15 @@ mod tests { multipart(&integration, &integration).await; signing(&integration).await; - tagging(&integration, !config.disable_tagging, |p| { - let client = Arc::clone(&integration.client); - async move { client.get_object_tagging(&p).await } - }) - .await; + // Object tagging is not supported by S3 Express One Zone + if config.session_provider.is_none() { + tagging(&integration, !config.disable_tagging, |p| { + let client = Arc::clone(&integration.client); + async move { client.get_object_tagging(&p).await } + }) + .await; + } + if test_not_exists { copy_if_not_exists(&integration).await; } diff --git a/src/client/mod.rs b/src/client/mod.rs index 4a78927..252e9fd 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -679,6 +679,13 @@ mod cloud { cache: Default::default(), } } + + /// Override the minimum remaining TTL for a cached token to be used + #[cfg(feature = "aws")] + pub fn with_min_ttl(mut self, min_ttl: Duration) -> Self { + self.cache = self.cache.with_min_ttl(min_ttl); + self + } } #[async_trait] diff --git a/src/client/token.rs b/src/client/token.rs index 7e48d35..7a3c807 100644 --- a/src/client/token.rs +++ b/src/client/token.rs @@ -16,7 +16,7 @@ // under the License. 
use std::future::Future; -use std::time::Instant; +use std::time::{Duration, Instant}; use tokio::sync::Mutex; /// A temporary authentication token with an associated expiry @@ -34,17 +34,25 @@ pub struct TemporaryToken { #[derive(Debug)] pub struct TokenCache { cache: Mutex>>, + min_ttl: Duration, } impl Default for TokenCache { fn default() -> Self { Self { cache: Default::default(), + min_ttl: Duration::from_secs(300), } } } impl TokenCache { + /// Override the minimum remaining TTL for a cached token to be used + #[cfg(feature = "aws")] + pub fn with_min_ttl(self, min_ttl: Duration) -> Self { + Self { min_ttl, ..self } + } + pub async fn get_or_insert_with(&self, f: F) -> Result where F: FnOnce() -> Fut + Send, @@ -55,13 +63,7 @@ impl TokenCache { if let Some(cached) = locked.as_ref() { match cached.expiry { - Some(ttl) - if ttl - .checked_duration_since(now) - .unwrap_or_default() - .as_secs() - > 300 => - { + Some(ttl) if ttl.checked_duration_since(now).unwrap_or_default() > self.min_ttl => { return Ok(cached.token.clone()); } None => return Ok(cached.token.clone()), From 2550b9795ed8fc7e3b8ec30e263131957b6f19d8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 5 Jan 2024 13:21:31 +0000 Subject: [PATCH 257/397] Prepare object_store 0.9.0 (#5224) (#5285) --- CHANGELOG-old.md | 76 ++++++++++++++++++++++ CHANGELOG.md | 104 ++++++++++++------------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 4 files changed, 120 insertions(+), 66 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 6780f7d..d01b8a3 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,82 @@ # Historical Changelog +## [object_store_0.8.0](https://github.com/apache/arrow-rs/tree/object_store_0.8.0) (2023-11-02) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.1...object_store_0.8.0) + +**Breaking changes:** + +- Remove ObjectStore::append [\#5016](https://github.com/apache/arrow-rs/pull/5016) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Don't panic on invalid Azure access key \(\#4972\) [\#4974](https://github.com/apache/arrow-rs/pull/4974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Return `PutResult` with an ETag from ObjectStore::put \(\#4934\) [\#4944](https://github.com/apache/arrow-rs/pull/4944) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectMeta::version and GetOptions::version \(\#4925\) [\#4935](https://github.com/apache/arrow-rs/pull/4935) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add GetOptions::head [\#4931](https://github.com/apache/arrow-rs/pull/4931) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove Nested async and Fallibility from ObjectStore::list [\#4930](https://github.com/apache/arrow-rs/pull/4930) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore::put_opts / Conditional Put [\#4879](https://github.com/apache/arrow-rs/pull/4984) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Relax Path Safety on Parse 
[\#5019](https://github.com/apache/arrow-rs/issues/5019) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore: hard to determine the cause of the error thrown from retry [\#5013](https://github.com/apache/arrow-rs/issues/5013) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- continue existing multi-part upload [\#4961](https://github.com/apache/arrow-rs/issues/4961) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Simplify ObjectStore::List [\#4946](https://github.com/apache/arrow-rs/issues/4946) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Return ETag and Version on Put [\#4934](https://github.com/apache/arrow-rs/issues/4934) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Not Signing Requests in AmazonS3 [\#4927](https://github.com/apache/arrow-rs/issues/4927) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Get Object By Version [\#4925](https://github.com/apache/arrow-rs/issues/4925) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Plans for supporting Extension Array to support Fixed shape tensor Array [\#4890](https://github.com/apache/arrow-rs/issues/4890) +- Conditional Put Support [\#4879](https://github.com/apache/arrow-rs/issues/4879) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- creates\_dir\_if\_not\_present\_append Test is Flaky [\#4872](https://github.com/apache/arrow-rs/issues/4872) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Release object\_store `0.7.1` [\#4858](https://github.com/apache/arrow-rs/issues/4858) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support User-Defined Object Metadata [\#4754](https://github.com/apache/arrow-rs/issues/4754) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- APIs for directly managing multi-part uploads and saving potential parquet footers [\#4608](https://github.com/apache/arrow-rs/issues/4608) + +**Fixed bugs:** + +- ObjectStore parse\_url Incorrectly Handles URLs with Spaces [\#5017](https://github.com/apache/arrow-rs/issues/5017) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[objects-store\]: periods/dots error in GCP bucket [\#4991](https://github.com/apache/arrow-rs/issues/4991) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure ImdsManagedIdentityProvider does not work in Azure functions [\#4976](https://github.com/apache/arrow-rs/issues/4976) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Panic when using an azure object store with an invalid access key [\#4972](https://github.com/apache/arrow-rs/issues/4972) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Handle Body Errors in AWS CompleteMultipartUpload [\#4965](https://github.com/apache/arrow-rs/issues/4965) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore multiple\_append Test is Flaky [\#4868](https://github.com/apache/arrow-rs/issues/4868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[objectstore\] Problem with special characters in file path [\#4454](https://github.com/apache/arrow-rs/issues/4454) + +**Closed issues:** + +- Include onelake fabric path for https [\#5000](https://github.com/apache/arrow-rs/issues/5000) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] Support generating and using signed upload URLs [\#4763](https://github.com/apache/arrow-rs/issues/4763) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Relax path safety \(\#5019\) [\#5020](https://github.com/apache/arrow-rs/pull/5020) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Decode URL paths \(\#5017\) [\#5018](https://github.com/apache/arrow-rs/pull/5018) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ObjectStore: make error msg thrown from retry more detailed [\#5012](https://github.com/apache/arrow-rs/pull/5012) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Rachelint](https://github.com/Rachelint)) +- Support onelake fabric paths in parse\_url \(\#5000\) [\#5002](https://github.com/apache/arrow-rs/pull/5002) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object tagging \(\#4754\) [\#4999](https://github.com/apache/arrow-rs/pull/4999) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- \[MINOR\] No need to jump to web pages [\#4994](https://github.com/apache/arrow-rs/pull/4994) ([smallzhongfeng](https://github.com/smallzhongfeng)) +- Pushdown list\_with\_offset for GCS [\#4993](https://github.com/apache/arrow-rs/pull/4993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Support bucket name with `.` when parsing GCS URL \(\#4991\) [\#4992](https://github.com/apache/arrow-rs/pull/4992) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Increase default timeout to 30 seconds [\#4989](https://github.com/apache/arrow-rs/pull/4989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Conditional Put \(\#4879\) [\#4984](https://github.com/apache/arrow-rs/pull/4984) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.30.0 to 0.31.0 in /object\_store [\#4983](https://github.com/apache/arrow-rs/pull/4983) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump actions/setup-node from 3 to 4 [\#4982](https://github.com/apache/arrow-rs/pull/4982) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support ImdsManagedIdentityProvider in Azure Functions \(\#4976\) [\#4977](https://github.com/apache/arrow-rs/pull/4977) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add MultiPartStore \(\#4961\) \(\#4608\) [\#4971](https://github.com/apache/arrow-rs/pull/4971) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Split gcp Module [\#4956](https://github.com/apache/arrow-rs/pull/4956) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add module links in docs root [\#4955](https://github.com/apache/arrow-rs/pull/4955) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Prepare arrow 48.0.0 [\#4948](https://github.com/apache/arrow-rs/pull/4948) ([tustvold](https://github.com/tustvold)) +- Allow opting out of request signing \(\#4927\) [\#4929](https://github.com/apache/arrow-rs/pull/4929) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Default connection and request timeouts of 5 seconds [\#4928](https://github.com/apache/arrow-rs/pull/4928) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Support service\_account in ApplicationDefaultCredentials and Use SelfSignedJwt [\#4926](https://github.com/apache/arrow-rs/pull/4926) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Generate `ETag`s for `InMemory` and `LocalFileSystem` \(\#4879\) [\#4922](https://github.com/apache/arrow-rs/pull/4922) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Cleanup `object_store::retry` client error handling [\#4915](https://github.com/apache/arrow-rs/pull/4915) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix integration tests [\#4889](https://github.com/apache/arrow-rs/pull/4889) ([tustvold](https://github.com/tustvold)) +- Support Parsing Avro File Headers [\#4888](https://github.com/apache/arrow-rs/pull/4888) ([tustvold](https://github.com/tustvold)) +- Update ring requirement from 0.16 to 0.17 in /object\_store [\#4887](https://github.com/apache/arrow-rs/pull/4887) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add AWS presigned URL support [\#4876](https://github.com/apache/arrow-rs/pull/4876) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([carols10cents](https://github.com/carols10cents)) +- Flush in creates\_dir\_if\_not\_present\_append \(\#4872\) [\#4874](https://github.com/apache/arrow-rs/pull/4874) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Flush in multiple\_append test \(\#4868\) [\#4869](https://github.com/apache/arrow-rs/pull/4869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Enable new integration tests \(\#4828\) [\#4862](https://github.com/apache/arrow-rs/pull/4862) ([tustvold](https://github.com/tustvold)) ## [object_store_0.7.1](https://github.com/apache/arrow-rs/tree/object_store_0.7.1) (2023-09-26) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a4fcd0..db2009b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,83 +19,61 @@ # Changelog -## [object_store_0.8.0](https://github.com/apache/arrow-rs/tree/object_store_0.8.0) (2023-11-02) +## [object_store_0.9.0](https://github.com/apache/arrow-rs/tree/object_store_0.9.0) (2024-01-05) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.1...object_store_0.8.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.8.0...object_store_0.9.0) **Breaking changes:** -- Remove ObjectStore::append [\#5016](https://github.com/apache/arrow-rs/pull/5016) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([tustvold](https://github.com/tustvold)) -- Don't panic on invalid Azure access key \(\#4972\) [\#4974](https://github.com/apache/arrow-rs/pull/4974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Return `PutResult` with an ETag from ObjectStore::put \(\#4934\) [\#4944](https://github.com/apache/arrow-rs/pull/4944) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add ObjectMeta::version and GetOptions::version \(\#4925\) [\#4935](https://github.com/apache/arrow-rs/pull/4935) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add GetOptions::head [\#4931](https://github.com/apache/arrow-rs/pull/4931) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Remove Nested async and Fallibility from ObjectStore::list [\#4930](https://github.com/apache/arrow-rs/pull/4930) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add ObjectStore::put_opts / Conditional Put [\#4879](https://github.com/apache/arrow-rs/pull/4984) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove deprecated try\_with\_option methods [\#5237](https://github.com/apache/arrow-rs/pull/5237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: full HTTP range support [\#5222](https://github.com/apache/arrow-rs/pull/5222) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([clbarnes](https://github.com/clbarnes)) +- feat\(object\_store\): use http1 by default [\#5204](https://github.com/apache/arrow-rs/pull/5204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- refactor: change `object_store` CA handling [\#5056](https://github.com/apache/arrow-rs/pull/5056) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) **Implemented enhancements:** -- Relax Path Safety on Parse [\#5019](https://github.com/apache/arrow-rs/issues/5019) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- ObjectStore: hard to determine the cause of the error thrown from retry [\#5013](https://github.com/apache/arrow-rs/issues/5013) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- continue existing multi-part upload [\#4961](https://github.com/apache/arrow-rs/issues/4961) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Simplify ObjectStore::List [\#4946](https://github.com/apache/arrow-rs/issues/4946) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Return ETag and Version on Put [\#4934](https://github.com/apache/arrow-rs/issues/4934) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support Not Signing Requests in AmazonS3 [\#4927](https://github.com/apache/arrow-rs/issues/4927) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Get Object By Version [\#4925](https://github.com/apache/arrow-rs/issues/4925) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Plans for supporting Extension Array to support Fixed shape tensor Array 
[\#4890](https://github.com/apache/arrow-rs/issues/4890) -- Conditional Put Support [\#4879](https://github.com/apache/arrow-rs/issues/4879) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- creates\_dir\_if\_not\_present\_append Test is Flaky [\#4872](https://github.com/apache/arrow-rs/issues/4872) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Release object\_store `0.7.1` [\#4858](https://github.com/apache/arrow-rs/issues/4858) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support User-Defined Object Metadata [\#4754](https://github.com/apache/arrow-rs/issues/4754) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- APIs for directly managing multi-part uploads and saving potential parquet footers [\#4608](https://github.com/apache/arrow-rs/issues/4608) +- Azure Signed URL Support [\#5232](https://github.com/apache/arrow-rs/issues/5232) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object-store\] Make aws region optional. [\#5211](https://github.com/apache/arrow-rs/issues/5211) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store,gcp\] Document GoogleCloudStorage Default Credentials [\#5187](https://github.com/apache/arrow-rs/issues/5187) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support S3 Express One Zone [\#5140](https://github.com/apache/arrow-rs/issues/5140) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `object_store`: Allow 403 Forbidden for `copy_if_not_exists` S3 status code [\#5132](https://github.com/apache/arrow-rs/issues/5132) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add `copy_if_not_exists` support for AmazonS3 via DynamoDB Lock Support [\#4880](https://github.com/apache/arrow-rs/issues/4880) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: native certs, w/o webpki-roots [\#4870](https://github.com/apache/arrow-rs/issues/4870) +- object\_store: range request with suffix [\#4611](https://github.com/apache/arrow-rs/issues/4611) **Fixed bugs:** -- ObjectStore parse\_url Incorrectly Handles URLs with Spaces [\#5017](https://github.com/apache/arrow-rs/issues/5017) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[objects-store\]: periods/dots error in GCP bucket [\#4991](https://github.com/apache/arrow-rs/issues/4991) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Azure ImdsManagedIdentityProvider does not work in Azure functions [\#4976](https://github.com/apache/arrow-rs/issues/4976) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Panic when using an azure object store with an invalid access key [\#4972](https://github.com/apache/arrow-rs/issues/4972) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Handle Body Errors in AWS CompleteMultipartUpload [\#4965](https://github.com/apache/arrow-rs/issues/4965) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- ObjectStore multiple\_append Test is Flaky [\#4868](https://github.com/apache/arrow-rs/issues/4868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[objectstore\] Problem with special characters in file path [\#4454](https://github.com/apache/arrow-rs/issues/4454) +- ObjectStore::get\_opts Incorrectly Returns Response Size not Object Size 
[\#5272](https://github.com/apache/arrow-rs/issues/5272) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Single object store has limited throughput on GCS [\#5194](https://github.com/apache/arrow-rs/issues/5194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- local::tests::invalid\_path fails during object store release verification [\#5035](https://github.com/apache/arrow-rs/issues/5035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Object Store Doctest Failure with Default Features [\#5025](https://github.com/apache/arrow-rs/issues/5025) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -**Closed issues:** +**Documentation updates:** -- Include onelake fabric path for https [\#5000](https://github.com/apache/arrow-rs/issues/5000) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[object\_store\] Support generating and using signed upload URLs [\#4763](https://github.com/apache/arrow-rs/issues/4763) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Document default value of InstanceCredentialProvider [\#5188](https://github.com/apache/arrow-rs/pull/5188) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([justinabrahms](https://github.com/justinabrahms)) **Merged pull requests:** -- Relax path safety \(\#5019\) [\#5020](https://github.com/apache/arrow-rs/pull/5020) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Decode URL paths \(\#5017\) [\#5018](https://github.com/apache/arrow-rs/pull/5018) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- ObjectStore: make error msg thrown from retry more detailed [\#5012](https://github.com/apache/arrow-rs/pull/5012) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Rachelint](https://github.com/Rachelint)) -- Support onelake fabric paths in parse\_url \(\#5000\) [\#5002](https://github.com/apache/arrow-rs/pull/5002) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Object tagging \(\#4754\) [\#4999](https://github.com/apache/arrow-rs/pull/4999) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- \[MINOR\] No need to jump to web pages [\#4994](https://github.com/apache/arrow-rs/pull/4994) ([smallzhongfeng](https://github.com/smallzhongfeng)) -- Pushdown list\_with\_offset for GCS [\#4993](https://github.com/apache/arrow-rs/pull/4993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Support bucket name with `.` when parsing GCS URL \(\#4991\) [\#4992](https://github.com/apache/arrow-rs/pull/4992) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Increase default timeout to 30 seconds [\#4989](https://github.com/apache/arrow-rs/pull/4989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Conditional Put \(\#4879\) [\#4984](https://github.com/apache/arrow-rs/pull/4984) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update quick-xml requirement from 0.30.0 to 0.31.0 in /object\_store 
[\#4983](https://github.com/apache/arrow-rs/pull/4983) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Bump actions/setup-node from 3 to 4 [\#4982](https://github.com/apache/arrow-rs/pull/4982) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support ImdsManagedIdentityProvider in Azure Functions \(\#4976\) [\#4977](https://github.com/apache/arrow-rs/pull/4977) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add MultiPartStore \(\#4961\) \(\#4608\) [\#4971](https://github.com/apache/arrow-rs/pull/4971) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Split gcp Module [\#4956](https://github.com/apache/arrow-rs/pull/4956) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add module links in docs root [\#4955](https://github.com/apache/arrow-rs/pull/4955) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Prepare arrow 48.0.0 [\#4948](https://github.com/apache/arrow-rs/pull/4948) ([tustvold](https://github.com/tustvold)) -- Allow opting out of request signing \(\#4927\) [\#4929](https://github.com/apache/arrow-rs/pull/4929) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Default connection and request timeouts of 5 seconds [\#4928](https://github.com/apache/arrow-rs/pull/4928) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Support service\_account in ApplicationDefaultCredentials and Use SelfSignedJwt [\#4926](https://github.com/apache/arrow-rs/pull/4926) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Generate `ETag`s for `InMemory` and `LocalFileSystem` \(\#4879\) [\#4922](https://github.com/apache/arrow-rs/pull/4922) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Cleanup `object_store::retry` client error handling [\#4915](https://github.com/apache/arrow-rs/pull/4915) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix integration tests [\#4889](https://github.com/apache/arrow-rs/pull/4889) ([tustvold](https://github.com/tustvold)) -- Support Parsing Avro File Headers [\#4888](https://github.com/apache/arrow-rs/pull/4888) ([tustvold](https://github.com/tustvold)) -- Update ring requirement from 0.16 to 0.17 in /object\_store [\#4887](https://github.com/apache/arrow-rs/pull/4887) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add AWS presigned URL support [\#4876](https://github.com/apache/arrow-rs/pull/4876) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([carols10cents](https://github.com/carols10cents)) -- Flush in creates\_dir\_if\_not\_present\_append \(\#4872\) [\#4874](https://github.com/apache/arrow-rs/pull/4874) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Flush in multiple\_append test \(\#4868\) [\#4869](https://github.com/apache/arrow-rs/pull/4869) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Enable new integration tests \(\#4828\) [\#4862](https://github.com/apache/arrow-rs/pull/4862) ([tustvold](https://github.com/tustvold)) - - +- Retry Safe/Read-Only Requests on Timeout [\#5278](https://github.com/apache/arrow-rs/pull/5278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix ObjectMeta::size for range requests \(\#5272\) [\#5276](https://github.com/apache/arrow-rs/pull/5276) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- docs\(object\_store\): Mention `with_allow_http` in docs of `with_endpoint` [\#5275](https://github.com/apache/arrow-rs/pull/5275) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- Support S3 Express One Zone [\#5268](https://github.com/apache/arrow-rs/pull/5268) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object\_store\): Azure url signing [\#5259](https://github.com/apache/arrow-rs/pull/5259) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- DynamoDB ConditionalPut [\#5247](https://github.com/apache/arrow-rs/pull/5247) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Default AWS region to us-east-1 \(\#5211\) [\#5244](https://github.com/apache/arrow-rs/pull/5244) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ci: Fail Miri CI on first failure [\#5243](https://github.com/apache/arrow-rs/pull/5243) ([Jefffrey](https://github.com/Jefffrey)) +- Bump actions/upload-pages-artifact from 2 to 3 [\#5229](https://github.com/apache/arrow-rs/pull/5229) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump actions/setup-python from 4 to 5 [\#5175](https://github.com/apache/arrow-rs/pull/5175) ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: ensure take\_fixed\_size\_list can handle null indices [\#5170](https://github.com/apache/arrow-rs/pull/5170) ([westonpace](https://github.com/westonpace)) +- Bump actions/labeler from 4.3.0 to 5.0.0 [\#5167](https://github.com/apache/arrow-rs/pull/5167) ([dependabot[bot]](https://github.com/apps/dependabot)) +- object\_store: fix failing doctest with default features [\#5161](https://github.com/apache/arrow-rs/pull/5161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Jefffrey](https://github.com/Jefffrey)) +- Update rustls-pemfile requirement from 1.0 to 2.0 in /object\_store [\#5155](https://github.com/apache/arrow-rs/pull/5155) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow 403 for overwrite prevention [\#5134](https://github.com/apache/arrow-rs/pull/5134) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([emcake](https://github.com/emcake)) +- Fix ObjectStore.LocalFileSystem.put\_opts for blobfuse [\#5094](https://github.com/apache/arrow-rs/pull/5094) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) +- Update itertools requirement from 0.11.0 to 0.12.0 in /object\_store 
[\#5077](https://github.com/apache/arrow-rs/pull/5077) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add a PR under "Breaking changes" in the object\_store 0.8.0 changelog [\#5063](https://github.com/apache/arrow-rs/pull/5063) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([carols10cents](https://github.com/carols10cents)) +- Prepare arrow 49.0.0 [\#5054](https://github.com/apache/arrow-rs/pull/5054) ([tustvold](https://github.com/tustvold)) +- Fix invalid\_path test [\#5026](https://github.com/apache/arrow-rs/pull/5026) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement `copy_if_not_exist` for `AmazonS3` using DynamoDB \(\#4880\) [\#4918](https://github.com/apache/arrow-rs/pull/4918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index e7f99e5..512fa30 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.8.0" +version = "0.9.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 33eeb33..a083f61 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.7.1" -FUTURE_RELEASE="object_store_0.8.0" +SINCE_TAG="object_store_0.8.0" +FUTURE_RELEASE="object_store_0.9.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From b29b27402ed75223f9055f7ad2d89e1ede92accd Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 11 Jan 2024 11:43:09 +0100 Subject: [PATCH 258/397] refactor: log server error during object store retries (#5294) Similar to transport errors, log the server error as well. That should help debugging. 
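For context, the retry count and back-off interval that appear in this log message come from the crate's `RetryConfig`. A minimal, hedged sketch of tuning it when building a store (assumes the `aws` feature; the bucket name is a placeholder):

use std::time::Duration;
use object_store::{aws::AmazonS3Builder, BackoffConfig, RetryConfig};

fn build_store_with_retries() -> object_store::Result<object_store::aws::AmazonS3> {
    // "retry {} of {}" in the message above is driven by `max_retries`;
    // "backing off for {} seconds" follows the exponential `BackoffConfig`.
    let retry = RetryConfig {
        backoff: BackoffConfig {
            init_backoff: Duration::from_millis(100),
            max_backoff: Duration::from_secs(15),
            base: 2.0,
        },
        max_retries: 5,
        retry_timeout: Duration::from_secs(180),
    };
    AmazonS3Builder::from_env()
        .with_bucket_name("example-bucket") // placeholder bucket
        .with_retry(retry)
        .build()
}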
--- src/client/retry.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/client/retry.rs b/src/client/retry.rs index 9d21867..fbd3645 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -246,7 +246,13 @@ impl RetryExt for reqwest::RequestBuilder { let sleep = backoff.next(); retries += 1; - info!("Encountered server error, backing off for {} seconds, retry {} of {}", sleep.as_secs_f32(), retries, max_retries); + info!( + "Encountered server error, backing off for {} seconds, retry {} of {}: {}", + sleep.as_secs_f32(), + retries, + max_retries, + e, + ); tokio::time::sleep(sleep).await; } }, @@ -277,7 +283,13 @@ impl RetryExt for reqwest::RequestBuilder { } let sleep = backoff.next(); retries += 1; - info!("Encountered transport error ({}) backing off for {} seconds, retry {} of {}", e, sleep.as_secs_f32(), retries, max_retries); + info!( + "Encountered transport error backing off for {} seconds, retry {} of {}: {}", + sleep.as_secs_f32(), + retries, + max_retries, + e, + ); tokio::time::sleep(sleep).await; } } From d76b0c1720e4e16b67d707c9ba6c75aae5cf570f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:07:40 +0000 Subject: [PATCH 259/397] Remove localstack DynamoDB workaround (#5267) (#5307) --- src/aws/dynamo.rs | 60 ++++------------------------------------------- 1 file changed, 4 insertions(+), 56 deletions(-) diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index 4331ae2..2390187 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -242,40 +242,6 @@ impl DynamoCommit { } } - /// Retrieve a lock, returning an error if it doesn't exist - async fn get_lock(&self, s3: &S3Client, path: &str, etag: Option<&str>) -> Result { - let key_attributes = [ - ("path", AttributeValue::from(path)), - ("etag", AttributeValue::from(etag.unwrap_or("*"))), - ]; - let req = GetItem { - table_name: &self.table_name, - key: Map(&key_attributes), - }; - let credential = s3.config.get_credential().await?; - - let resp = self - .request(s3, credential.as_deref(), "DynamoDB_20120810.GetItem", req) - .await - .map_err(|e| e.error(STORE, path.to_string()))?; - - let body = resp.bytes().await.map_err(|e| Error::Generic { - store: STORE, - source: Box::new(e), - })?; - - let response: GetItemResponse<'_> = - serde_json::from_slice(body.as_ref()).map_err(|e| Error::Generic { - store: STORE, - source: Box::new(e), - })?; - - extract_lease(&response.item).ok_or_else(|| Error::NotFound { - path: path.into(), - source: "DynamoDB GetItem returned no items".to_string().into(), - }) - } - /// Attempt to acquire a lock, reclaiming an existing lease if provided async fn try_lock( &self, @@ -332,22 +298,10 @@ impl DynamoCommit { Err(e) => match parse_error_response(&e) { Some(e) if e.error.ends_with(CONFLICT) => match extract_lease(&e.item) { Some(lease) => Ok(TryLockResult::Conflict(lease)), - // ReturnValuesOnConditionCheckFailure is a relatively recent addition - // to DynamoDB and is not supported by dynamodb-local, which is used - // by localstack. In such cases the conflict error will not contain - // the conflicting item, and we must instead perform a get request - // - // There is a potential race here if the conflicting record is removed - // before we retrieve it. 
We could retry the transaction in such a scenario, - // but as this only occurs for emulators, we simply abort with a - // not found error - // - // - // - // - None => Ok(TryLockResult::Conflict( - self.get_lock(s3, path, etag).await?, - )), + None => Err(Error::Generic { + store: STORE, + source: "Failed to extract lease from conflict ReturnValuesOnConditionCheckFailure response".into() + }), }, _ => Err(Error::Generic { store: STORE, @@ -509,12 +463,6 @@ struct GetItem<'a> { key: Map<'a, &'a str, AttributeValue<'a>>, } -#[derive(Deserialize)] -struct GetItemResponse<'a> { - #[serde(borrow, default, rename = "Item")] - item: HashMap<&'a str, AttributeValue<'a>>, -} - #[derive(Deserialize)] struct ErrorResponse<'a> { #[serde(rename = "__type")] From 831ca2657b1adb78bda8b838d6b338a89382ec5f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 19 Jan 2024 10:47:24 +0000 Subject: [PATCH 260/397] Pass options to HTTPBuilder in parse_url_opts (#5310) (#5311) --- src/parse.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index ddea034..47e537c 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -105,7 +105,7 @@ impl ObjectStoreScheme { } } -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +#[cfg(feature = "cloud")] macro_rules! builder_opts { ($builder:ty, $url:expr, $options:expr) => {{ let builder = $options.into_iter().fold( @@ -164,8 +164,7 @@ where } #[cfg(feature = "http")] ObjectStoreScheme::Http => { - let url = &url[..url::Position::BeforePath]; - Box::new(crate::http::HttpBuilder::new().with_url(url).build()?) as _ + builder_opts!(crate::http::HttpBuilder, url, _options) } #[cfg(not(all(feature = "aws", feature = "azure", feature = "gcp", feature = "http")))] s => { From cc5813ce522341da8c116f70b3a7489daa605097 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 19 Jan 2024 10:47:42 +0000 Subject: [PATCH 261/397] Update IOx links (#5312) --- README.md | 4 ++-- src/lib.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fd09ec7..3e09471 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ # Rust Object Store A focused, easy to use, idiomatic, high performance, `async` object -store library interacting with object stores. +store library for interacting with object stores. Using this crate, the same binary and code can easily run in multiple clouds and local test environments, via a simple runtime configuration @@ -33,7 +33,7 @@ change. Supported object stores include: * Memory * Custom implementations -Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to [Apache Arrow](https://arrow.apache.org/). +Originally developed by [InfluxData](https://www.influxdata.com/) and later donated to [Apache Arrow](https://arrow.apache.org/). See [docs.rs](https://docs.rs/object_store) for usage instructions diff --git a/src/lib.rs b/src/lib.rs index 53a5356..a25224a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,11 +51,11 @@ //! //! 5. Small dependency footprint, depending on only a small number of common crates //! -//! Originally developed for [InfluxDB IOx] and subsequently donated +//! Originally developed by [InfluxData] and subsequently donated //! to [Apache Arrow]. //! //! [Apache Arrow]: https://arrow.apache.org/ -//! [InfluxDB IOx]: https://github.com/influxdata/influxdb_iox/ +//! 
[InfluxData]: https://www.influxdata.com/ //! [crates.io]: https://github.com/rust-lang/crates.io //! [ACID]: https://en.wikipedia.org/wiki/ACID //! [S3]: https://aws.amazon.com/s3/ From 0aec97fa00eca7f384209b2b711c915fee8fb2de Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 20 Jan 2024 10:48:01 +0000 Subject: [PATCH 262/397] Test parse_url_opts for HTTP (#5310) (#5316) * Test parse_url_opts for HTTP (#5310) * Format --- src/parse.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/parse.rs b/src/parse.rs index 47e537c..116c2ad 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -109,7 +109,7 @@ impl ObjectStoreScheme { macro_rules! builder_opts { ($builder:ty, $url:expr, $options:expr) => {{ let builder = $options.into_iter().fold( - <$builder>::new().with_url($url.as_str()), + <$builder>::new().with_url($url.to_string()), |builder, (key, value)| match key.as_ref().parse() { Ok(k) => builder.with_config(k, value), Err(_) => builder, @@ -164,6 +164,7 @@ where } #[cfg(feature = "http")] ObjectStoreScheme::Http => { + let url = &url[..url::Position::BeforePath]; builder_opts!(crate::http::HttpBuilder, url, _options) } #[cfg(not(all(feature = "aws", feature = "azure", feature = "gcp", feature = "http")))] @@ -305,4 +306,28 @@ mod tests { let (_, path) = parse_url(&url).unwrap(); assert_eq!(path.as_ref(), "my file with spaces"); } + + #[tokio::test] + #[cfg(feature = "http")] + async fn test_url_http() { + use crate::client::mock_server::MockServer; + use hyper::{header::USER_AGENT, Body, Response}; + + let server = MockServer::new(); + + server.push_fn(|r| { + assert_eq!(r.uri().path(), "/foo/bar"); + assert_eq!(r.headers().get(USER_AGENT).unwrap(), "test_url"); + Response::new(Body::empty()) + }); + + let test = format!("{}/foo/bar", server.url()); + let opts = [("user_agent", "test_url"), ("allow_http", "true")]; + let url = test.parse().unwrap(); + let (store, path) = parse_url_opts(&url, opts).unwrap(); + assert_eq!(path.as_ref(), "foo/bar"); + store.get(&path).await.unwrap(); + + server.shutdown().await; + } } From e5dfda3f961502fec857c765db28d1403d72e44d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 30 Jan 2024 16:12:19 +0000 Subject: [PATCH 263/397] Exclude questions from changelog (#5349) --- .github_changelog_generator | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github_changelog_generator b/.github_changelog_generator index 69b574a..6d44a8e 100644 --- a/.github_changelog_generator +++ b/.github_changelog_generator @@ -23,5 +23,5 @@ add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":[" # so that the component is shown associated with the issue issue-line-labels=object-store # skip non object_store issues -exclude-labels=development-process,invalid,arrow,parquet,arrow-flight,parquet-derive +exclude-labels=development-process,invalid,arrow,parquet,arrow-flight,parquet-derive,question breaking_labels=api-change From 025700d2d6f0a191b791fca4387405b62687d913 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 7 Feb 2024 08:22:19 -0800 Subject: [PATCH 264/397] docs(object-store): add warning to flush (#5369) * add warning to flush * Update object_store/src/lib.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * format --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/lib.rs | 9 +++++++++ 1 
file changed, 9 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index a25224a..deb133d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -551,6 +551,15 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// writer fails or panics, you must call [ObjectStore::abort_multipart] /// to clean up partially written data. /// + ///
+ /// It is recommended applications wait for any in-flight requests to complete by calling `flush`, if + /// there may be a significant gap in time (> ~30s) before the next write. + /// These gaps can include times where the function returns control to the + /// caller while keeping the writer open. If `flush` is not called, futures + /// for in-flight requests may be left unpolled long enough for the requests + /// to time out, causing the write to fail. + ///
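// Editor's sketch (not part of this patch) illustrating the recommendation
// above: flush the writer returned by `put_multipart` before any long pause,
// so in-flight part uploads are not left unpolled until they time out.
// Assumes any `ObjectStore` implementation; the path is a placeholder.
use std::sync::Arc;
use object_store::{path::Path, ObjectStore};
use tokio::io::AsyncWriteExt;

async fn write_with_flush(store: Arc<dyn ObjectStore>) -> Result<(), Box<dyn std::error::Error>> {
    let path = Path::from("data/example.bin");
    let (_id, mut writer) = store.put_multipart(&path).await?;
    writer.write_all(b"first chunk").await?;
    writer.flush().await?; // wait for in-flight requests before a long gap
    // ... control may return to the caller here for a while ...
    writer.write_all(b"second chunk").await?;
    writer.shutdown().await?; // finalize the multipart upload
    Ok(())
}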
+ /// /// For applications requiring fine-grained control of multipart uploads /// see [`MultiPartStore`], although note that this interface cannot be /// supported by all [`ObjectStore`] backends. From 33bcbe62646480e2cb9a386dbe5ad4dc5ff4689f Mon Sep 17 00:00:00 2001 From: Brad V Date: Thu, 8 Feb 2024 09:26:47 -0500 Subject: [PATCH 265/397] Pull container name from URL for Azure blob in https://.blob.core.windows.net/ case (#5371) --- src/azure/builder.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 905fa52..530095f 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -474,6 +474,7 @@ impl MicrosoftAzureBuilder { /// - `azure:///` (custom) /// - `https://.dfs.core.windows.net` /// - `https://.blob.core.windows.net` + /// - `https://.blob.core.windows.net/` /// - `https://.dfs.fabric.microsoft.com` /// - `https://.dfs.fabric.microsoft.com/` /// - `https://.blob.fabric.microsoft.com` @@ -589,6 +590,9 @@ impl MicrosoftAzureBuilder { "https" => match host.split_once('.') { Some((a, "dfs.core.windows.net")) | Some((a, "blob.core.windows.net")) => { self.account_name = Some(validate(a)?); + if let Some(container) = parsed.path_segments().unwrap().next() { + self.container_name = Some(validate(container)?); + } } Some((a, "dfs.fabric.microsoft.com")) | Some((a, "blob.fabric.microsoft.com")) => { self.account_name = Some(validate(a)?); @@ -984,6 +988,14 @@ mod tests { assert_eq!(builder.account_name, Some("account".to_string())); assert!(!builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.core.windows.net/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("container".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + let mut builder = MicrosoftAzureBuilder::new(); builder .parse_url("https://account.dfs.fabric.microsoft.com/") From f10987d56ec2948148df2bebd750ffc2995f9b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Guedes?= Date: Sat, 17 Feb 2024 20:28:00 -0300 Subject: [PATCH 266/397] [object_store] Fix empty Multipart Upload for AWS S3 (#5405) * Fix empty multipart put for AWS S3 * Fix lint --- src/aws/client.rs | 10 ++++++++++ src/lib.rs | 15 +++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/aws/client.rs b/src/aws/client.rs index e06a0ce..fed6911 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -538,6 +538,16 @@ impl S3Client { upload_id: &str, parts: Vec, ) -> Result { + let parts = if parts.is_empty() { + // If no parts were uploaded, upload an empty part + // otherwise the completion request will fail + let part = self + .put_part(location, &upload_id.to_string(), 0, Bytes::new()) + .await?; + vec![part] + } else { + parts + }; let request = CompleteMultipartUpload::from(parts); let body = quick_xml::se::to_string(&request).unwrap(); diff --git a/src/lib.rs b/src/lib.rs index deb133d..af5676e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2200,6 +2200,21 @@ mod tests { let meta = storage.head(&path).await.unwrap(); assert_eq!(meta.size, chunk_size * 2); + + // Empty case + let path = Path::from("test_empty_multipart"); + + let id = multipart.create_multipart(&path).await.unwrap(); + + let parts = vec![]; + + multipart + .complete_multipart(&path, &id, parts) + .await + .unwrap(); + + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.size, 0); } #[cfg(any(feature = "azure", 
feature = "aws"))] From c81c58e76cb5b629c5711f60473f99c77497bd4b Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 20 Feb 2024 11:50:18 +0800 Subject: [PATCH 267/397] docds(object_store): Mention HTTP/WebDAV in README (#5409) Signed-off-by: Xuanwo --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3e09471..1799bf8 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ change. Supported object stores include: * [Google Cloud Storage](https://cloud.google.com/storage) * Local files * Memory +* [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518) * Custom implementations Originally developed by [InfluxData](https://www.influxdata.com/) and later donated to [Apache Arrow](https://arrow.apache.org/). From 39287ca39983b90f8ea11de8233057b513a95886 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebasti=C3=A1n=20Galkin?= Date: Wed, 21 Feb 2024 15:21:59 -0300 Subject: [PATCH 268/397] fix(object_store): Include Content-MD5 header for S3 DeleteObjects (#5415) * fix(object_store): Include Content-MD5 header for S3 DeleteObjects S3 API [specification](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) requires the presence of this header for all `DeleteObjects` requests to general purpose buckets: > The Content-MD5 request header is required for all Multi-Object Delete requests Some platform, such as MinIO, enforce this requirement, failing requests that don't include the header. * Switch dependency from md5 to md-5 md-5 seems better maintained. --- Cargo.toml | 3 ++- src/aws/client.rs | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 512fa30..f3aaf35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls- ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } +md-5 = { version = "0.10.6", default-features = false, optional = true } [target.'cfg(target_family="unix")'.dev-dependencies] nix = { version = "0.27.1", features = ["fs"] } @@ -62,7 +63,7 @@ nix = { version = "0.27.1", features = ["fs"] } cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] -aws = ["cloud"] +aws = ["cloud", "md-5"] http = ["cloud"] tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] diff --git a/src/aws/client.rs b/src/aws/client.rs index fed6911..a31350f 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -43,6 +43,7 @@ use bytes::{Buf, Bytes}; use hyper::http; use hyper::http::HeaderName; use itertools::Itertools; +use md5::{Digest, Md5}; use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; use reqwest::{ @@ -438,6 +439,14 @@ impl S3Client { None }; + // S3 *requires* DeleteObjects to include a Content-MD5 header: + // https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html + // > "The Content-MD5 request header is required for all Multi-Object Delete requests" + // Some platforms, like MinIO, enforce this requirement and fail requests without the header. 
+ let mut hasher = Md5::new(); + hasher.update(&body); + builder = builder.header("Content-MD5", BASE64_STANDARD.encode(hasher.finalize())); + let response = builder .header(CONTENT_TYPE, "application/xml") .body(body) From df91c4cf2814cddf8ae1e1b4fc44341fa7f60c6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Guedes?= Date: Sun, 25 Feb 2024 16:34:06 -0300 Subject: [PATCH 269/397] Enable anonymous access for MicrosoftAzure (#5425) * Enables anonymous access for MicrosoftAzure store * Add test * Fix warnings * Change impl to prevent breaking change * Improve error message * Do not change AzureAuthorizer API --- src/azure/builder.rs | 39 +++++++++++++++++--- src/azure/client.rs | 26 +++++++++---- src/azure/credential.rs | 81 ++++++++++++++++++++++++++++++++++------- 3 files changed, 120 insertions(+), 26 deletions(-) diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 530095f..ee09534 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -169,6 +169,8 @@ pub struct MicrosoftAzureBuilder { client_options: ClientOptions, /// Credentials credentials: Option, + /// Skip signing requests + skip_signature: ConfigValue, /// When set to true, fabric url scheme will be used /// /// i.e. https://{account_name}.dfs.fabric.microsoft.com @@ -316,6 +318,13 @@ pub enum AzureConfigKey { /// - `use_azure_cli` UseAzureCli, + /// Skip signing requests + /// + /// Supported keys: + /// - `azure_skip_signature` + /// - `skip_signature` + SkipSignature, + /// Container name /// /// Supported keys: @@ -354,6 +363,7 @@ impl AsRef for AzureConfigKey { Self::MsiResourceId => "azure_msi_resource_id", Self::FederatedTokenFile => "azure_federated_token_file", Self::UseAzureCli => "azure_use_azure_cli", + Self::SkipSignature => "azure_skip_signature", Self::ContainerName => "azure_container_name", Self::DisableTagging => "azure_disable_tagging", Self::Client(key) => key.as_ref(), @@ -398,6 +408,7 @@ impl FromStr for AzureConfigKey { "azure_federated_token_file" | "federated_token_file" => Ok(Self::FederatedTokenFile), "azure_use_fabric_endpoint" | "use_fabric_endpoint" => Ok(Self::UseFabricEndpoint), "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), + "azure_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), "azure_container_name" | "container_name" => Ok(Self::ContainerName), "azure_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), // Backwards compatibility @@ -510,6 +521,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), AzureConfigKey::FederatedTokenFile => self.federated_token_file = Some(value.into()), AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), + AzureConfigKey::SkipSignature => self.skip_signature.parse(value), AzureConfigKey::UseEmulator => self.use_emulator.parse(value), AzureConfigKey::Endpoint => self.endpoint = Some(value.into()), AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), @@ -550,6 +562,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), + AzureConfigKey::SkipSignature => Some(self.skip_signature.to_string()), AzureConfigKey::Client(key) => self.client_options.get_config_value(key), AzureConfigKey::ContainerName => self.container_name.clone(), AzureConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), @@ -784,6 +797,14 @@ impl 
MicrosoftAzureBuilder { self } + /// If enabled, [`MicrosoftAzure`] will not fetch credentials and will not sign requests + /// + /// This can be useful when interacting with public containers + pub fn with_skip_signature(mut self, skip_signature: bool) -> Self { + self.skip_signature = skip_signature.into(); + self + } + /// If set to `true` will ignore any tags provided to put_opts pub fn with_disable_tagging(mut self, ignore: bool) -> Self { self.disable_tagging = ignore.into(); @@ -809,15 +830,20 @@ impl MicrosoftAzureBuilder { // Allow overriding defaults. Values taken from // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let key = match self.access_key { - Some(k) => AzureAccessKey::try_new(&k)?, - None => AzureAccessKey::try_new(EMULATOR_ACCOUNT_KEY)?, + let credential = if let Some(k) = self.access_key { + AzureCredential::AccessKey(AzureAccessKey::try_new(&k)?) + } else if let Some(bearer_token) = self.bearer_token { + AzureCredential::BearerToken(bearer_token) + } else if let Some(query_pairs) = self.sas_query_pairs { + AzureCredential::SASToken(query_pairs) + } else if let Some(sas) = self.sas_key { + AzureCredential::SASToken(split_sas(&sas)?) + } else { + AzureCredential::AccessKey(AzureAccessKey::try_new(EMULATOR_ACCOUNT_KEY)?) }; - let credential = static_creds(AzureCredential::AccessKey(key)); - self.client_options = self.client_options.with_allow_http(true); - (true, url, credential, account_name) + (true, url, static_creds(credential), account_name) } else { let account_name = self.account_name.ok_or(Error::MissingAccount {})?; let account_url = match self.endpoint { @@ -893,6 +919,7 @@ impl MicrosoftAzureBuilder { let config = AzureConfig { account, is_emulator, + skip_signature: self.skip_signature.get()?, container, disable_tagging: self.disable_tagging.get()?, retry_config: self.retry_config, diff --git a/src/azure/client.rs b/src/azure/client.rs index 41b7cbd..feea2f2 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -114,6 +114,9 @@ pub(crate) enum Error { #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] SASforSASNotSupported, + + #[snafu(display("Generating SAS keys while skipping signatures is not supported"))] + SASwithSkipSignature, } impl From for crate::Error { @@ -139,6 +142,7 @@ pub(crate) struct AzureConfig { pub retry_config: RetryConfig, pub service: Url, pub is_emulator: bool, + pub skip_signature: bool, pub disable_tagging: bool, pub client_options: ClientOptions, } @@ -155,6 +159,13 @@ impl AzureConfig { } url } + async fn get_credential(&self) -> Result>> { + if self.skip_signature { + Ok(None) + } else { + Some(self.credentials.get_credential().await).transpose() + } + } } /// A builder for a put request allowing customisation of the headers and query string @@ -176,7 +187,7 @@ impl<'a> PutRequest<'a> { } async fn send(self) -> Result { - let credential = self.config.credentials.get_credential().await?; + let credential = self.config.get_credential().await?; let response = self .builder .with_azure_authorization(&credential, &self.config.account) @@ -208,8 +219,8 @@ impl AzureClient { &self.config } - async fn get_credential(&self) -> Result> { - self.config.credentials.get_credential().await + async fn get_credential(&self) -> Result>> { + self.config.get_credential().await } fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> PutRequest<'a> { @@ -314,7 
+325,7 @@ impl AzureClient { // If using SAS authorization must include the headers in the URL // - if let AzureCredential::SASToken(pairs) = credential.as_ref() { + if let Some(AzureCredential::SASToken(pairs)) = credential.as_deref() { source.query_pairs_mut().extend_pairs(pairs); } @@ -384,8 +395,8 @@ impl AzureClient { let credential = self.get_credential().await?; let signed_start = chrono::Utc::now(); let signed_expiry = signed_start + expires_in; - match credential.as_ref() { - AzureCredential::BearerToken(_) => { + match credential.as_deref() { + Some(AzureCredential::BearerToken(_)) => { let key = self .get_user_delegation_key(&signed_start, &signed_expiry) .await?; @@ -398,13 +409,14 @@ impl AzureClient { Some(key), )) } - AzureCredential::AccessKey(key) => Ok(AzureSigner::new( + Some(AzureCredential::AccessKey(key)) => Ok(AzureSigner::new( key.to_owned(), self.config.account.clone(), signed_start, signed_expiry, None, )), + None => Err(Error::SASwithSkipSignature.into()), _ => Err(Error::SASforSASNotSupported.into()), } } diff --git a/src/azure/credential.rs b/src/azure/credential.rs index bfbbde8..9360831 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -36,6 +36,7 @@ use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Debug; +use std::ops::Deref; use std::process::Command; use std::str; use std::sync::Arc; @@ -186,6 +187,18 @@ impl AzureSigner { } } +fn add_date_and_version_headers(request: &mut Request) { + // rfc2822 string should never contain illegal characters + let date = Utc::now(); + let date_str = date.format(RFC1123_FMT).to_string(); + // we formatted the data string ourselves, so unwrapping should be fine + let date_val = HeaderValue::from_str(&date_str).unwrap(); + request.headers_mut().insert(DATE, date_val); + request + .headers_mut() + .insert(&VERSION, AZURE_VERSION.clone()); +} + /// Authorize a [`Request`] with an [`AzureAuthorizer`] #[derive(Debug)] pub struct AzureAuthorizer<'a> { @@ -204,15 +217,7 @@ impl<'a> AzureAuthorizer<'a> { /// Authorize `request` pub fn authorize(&self, request: &mut Request) { - // rfc2822 string should never contain illegal characters - let date = Utc::now(); - let date_str = date.format(RFC1123_FMT).to_string(); - // we formatted the data string ourselves, so unwrapping should be fine - let date_val = HeaderValue::from_str(&date_str).unwrap(); - request.headers_mut().insert(DATE, date_val); - request - .headers_mut() - .insert(&VERSION, AZURE_VERSION.clone()); + add_date_and_version_headers(request); match self.credential { AzureCredential::AccessKey(key) => { @@ -250,15 +255,30 @@ impl<'a> AzureAuthorizer<'a> { pub(crate) trait CredentialExt { /// Apply authorization to requests against azure storage accounts /// - fn with_azure_authorization(self, credential: &AzureCredential, account: &str) -> Self; + fn with_azure_authorization( + self, + credential: &Option>, + account: &str, + ) -> Self; } impl CredentialExt for RequestBuilder { - fn with_azure_authorization(self, credential: &AzureCredential, account: &str) -> Self { + fn with_azure_authorization( + self, + credential: &Option>, + account: &str, + ) -> Self { let (client, request) = self.build_split(); let mut request = request.expect("request valid"); - AzureAuthorizer::new(credential, account).authorize(&mut request); + match credential.as_deref() { + Some(credential) => { + AzureAuthorizer::new(credential, account).authorize(&mut request); + } + None => { + add_date_and_version_headers(&mut request); + 
} + } Self::from_parts(client, request) } @@ -911,12 +931,14 @@ impl CredentialProvider for AzureCliCredential { mod tests { use futures::executor::block_on; use hyper::body::to_bytes; - use hyper::{Body, Response}; + use hyper::{Body, Response, StatusCode}; use reqwest::{Client, Method}; use tempfile::NamedTempFile; use super::*; + use crate::azure::MicrosoftAzureBuilder; use crate::client::mock_server::MockServer; + use crate::{ObjectStore, Path}; #[tokio::test] async fn test_managed_identity() { @@ -1025,4 +1047,37 @@ mod tests { &AzureCredential::BearerToken("TOKEN".into()) ); } + + #[tokio::test] + async fn test_no_credentials() { + let server = MockServer::new(); + + let endpoint = server.url(); + let store = MicrosoftAzureBuilder::new() + .with_account("test") + .with_container_name("test") + .with_allow_http(true) + .with_bearer_token_authorization("token") + .with_endpoint(endpoint.to_string()) + .with_skip_signature(true) + .build() + .unwrap(); + + server.push_fn(|req| { + assert_eq!(req.method(), &Method::GET); + assert!(req.headers().get("Authorization").is_none()); + Response::builder() + .status(StatusCode::NOT_FOUND) + .body(Body::from("not found")) + .unwrap() + }); + + let path = Path::from("file.txt"); + match store.get(&path).await { + Err(crate::Error::NotFound { .. }) => {} + _ => { + panic!("unexpected response"); + } + } + } } From 7f4442fee0ea4b816b564d015913937fb0c5cc1e Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sun, 25 Feb 2024 20:55:43 -0800 Subject: [PATCH 270/397] feat: S3 server-side encryption (#5402) * feat: s3 server-side encryption * cleanup * fix instructions * also run with encryption in CI * feedback * fix clippy --- CONTRIBUTING.md | 39 ++++++-- src/aws/builder.rs | 228 ++++++++++++++++++++++++++++++++++++++++++++- src/aws/client.rs | 18 +++- src/aws/mod.rs | 54 ++++++++++- 4 files changed, 326 insertions(+), 13 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aeb38e1..4b0ef1f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,7 +39,8 @@ To test the S3 integration against [localstack](https://localstack.cloud/) First start up a container running localstack ``` -$ podman run -d -p 4566:4566 localstack/localstack:2.0 +$ LOCALSTACK_VERSION=sha256:a0b79cb2430f1818de2c66ce89d41bba40f5a1823410f5a7eaf3494b692eed97 +$ podman run -d -p 4566:4566 localstack/localstack@$LOCALSTACK_VERSION $ podman run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2 ``` @@ -47,13 +48,12 @@ Setup environment ``` export TEST_INTEGRATION=1 -export OBJECT_STORE_AWS_DEFAULT_REGION=us-east-1 -export OBJECT_STORE_AWS_ACCESS_KEY_ID=test -export OBJECT_STORE_AWS_SECRET_ACCESS_KEY=test -export OBJECT_STORE_AWS_ENDPOINT=http://localhost:4566 +export AWS_DEFAULT_REGION=us-east-1 export AWS_ACCESS_KEY_ID=test export AWS_SECRET_ACCESS_KEY=test -export OBJECT_STORE_BUCKET=test-bucket +export AWS_ENDPOINT=http://localhost:4566 +export AWS_ALLOW_HTTP=true +export AWS_BUCKET_NAME=test-bucket ``` Create a bucket using the AWS CLI @@ -66,6 +66,7 @@ Or directly with: ``` aws s3 mb s3://test-bucket --endpoint-url=http://localhost:4566 +aws --endpoint-url=http://localhost:4566 dynamodb create-table --table-name test-table --key-schema AttributeName=path,KeyType=HASH AttributeName=etag,KeyType=RANGE --attribute-definitions AttributeName=path,AttributeType=S AttributeName=etag,AttributeType=S --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5 ``` Run tests @@ -74,6 +75,32 @@ Run tests $ cargo test --features aws ``` +#### Encryption tests + +To 
create an encryption key for the tests, you can run the following command: + +``` +export AWS_SSE_KMS_KEY_ID=$(aws --endpoint-url=http://localhost:4566 \ + kms create-key --description "test key" | + jq -r '.KeyMetadata.KeyId') +``` + +To run integration tests with encryption, you can set the following environment variables: + +``` +export AWS_SERVER_SIDE_ENCRYPTION=aws:kms +export AWS_SSE_BUCKET_KEY=false +cargo test --features aws +``` + +As well as: + +``` +unset AWS_SSE_BUCKET_KEY +export AWS_SERVER_SIDE_ENCRYPTION=aws:kms:dsse +cargo test --features aws +``` + ### Azure To test the Azure integration diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 9a296bc..a578d1a 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -27,6 +27,7 @@ use crate::client::TokenCredentialProvider; use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use itertools::Itertools; +use reqwest::header::{HeaderMap, HeaderValue}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; @@ -83,6 +84,19 @@ enum Error { #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] RegionParse { bucket: String }, + + #[snafu(display("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", and \"sse:kms:dsse\".", passed))] + InvalidEncryptionType { passed: String }, + + #[snafu(display( + "Invalid encryption header values. Header: {}, source: {}", + header, + source + ))] + InvalidEncryptionHeader { + header: &'static str, + source: Box, + }, } impl From for crate::Error { @@ -160,6 +174,10 @@ pub struct AmazonS3Builder { conditional_put: Option>, /// Ignore tags disable_tagging: ConfigValue, + /// Encryption (See [`S3EncryptionConfigKey`]) + encryption_type: Option>, + encryption_kms_key_id: Option, + encryption_bucket_key_enabled: Option>, } /// Configuration keys for [`AmazonS3Builder`] @@ -322,6 +340,9 @@ pub enum AmazonS3ConfigKey { /// Client options Client(ClientConfigKey), + + /// Encryption options + Encryption(S3EncryptionConfigKey), } impl AsRef for AmazonS3ConfigKey { @@ -346,6 +367,7 @@ impl AsRef for AmazonS3ConfigKey { Self::ConditionalPut => "aws_conditional_put", Self::DisableTagging => "aws_disable_tagging", Self::Client(opt) => opt.as_ref(), + Self::Encryption(opt) => opt.as_ref(), } } } @@ -377,6 +399,13 @@ impl FromStr for AmazonS3ConfigKey { "aws_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + "aws_server_side_encryption" => Ok(Self::Encryption( + S3EncryptionConfigKey::ServerSideEncryption, + )), + "aws_sse_kms_key_id" => Ok(Self::Encryption(S3EncryptionConfigKey::KmsKeyId)), + "aws_sse_bucket_key_enabled" => { + Ok(Self::Encryption(S3EncryptionConfigKey::BucketKeyEnabled)) + } _ => match s.parse() { Ok(key) => Ok(Self::Client(key)), Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), @@ -486,6 +515,15 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ConditionalPut => { self.conditional_put = Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::Encryption(key) => match key { + S3EncryptionConfigKey::ServerSideEncryption => { + self.encryption_type = Some(ConfigValue::Deferred(value.into())) + } + S3EncryptionConfigKey::KmsKeyId => self.encryption_kms_key_id = Some(value.into()), + S3EncryptionConfigKey::BucketKeyEnabled => { + self.encryption_bucket_key_enabled = Some(ConfigValue::Deferred(value.into())) 
+ } + }, }; self } @@ -531,6 +569,16 @@ impl AmazonS3Builder { self.conditional_put.as_ref().map(ToString::to_string) } AmazonS3ConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), + AmazonS3ConfigKey::Encryption(key) => match key { + S3EncryptionConfigKey::ServerSideEncryption => { + self.encryption_type.as_ref().map(ToString::to_string) + } + S3EncryptionConfigKey::KmsKeyId => self.encryption_kms_key_id.clone(), + S3EncryptionConfigKey::BucketKeyEnabled => self + .encryption_bucket_key_enabled + .as_ref() + .map(ToString::to_string), + }, } } @@ -759,6 +807,35 @@ impl AmazonS3Builder { self } + /// Use SSE-KMS for server side encryption. + pub fn with_sse_kms_encryption(mut self, kms_key_id: impl Into) -> Self { + self.encryption_type = Some(ConfigValue::Parsed(S3EncryptionType::SseKms)); + if let Some(kms_key_id) = kms_key_id.into().into() { + self.encryption_kms_key_id = Some(kms_key_id); + } + self + } + + /// Use dual server side encryption for server side encryption. + pub fn with_dsse_kms_encryption(mut self, kms_key_id: impl Into) -> Self { + self.encryption_type = Some(ConfigValue::Parsed(S3EncryptionType::DsseKms)); + if let Some(kms_key_id) = kms_key_id.into().into() { + self.encryption_kms_key_id = Some(kms_key_id); + } + self + } + + /// Set whether to enable bucket key for server side encryption. This overrides + /// the bucket default setting for bucket keys. + /// + /// When bucket keys are disabled, each object is encrypted with a unique data key. + /// When bucket keys are enabled, a single data key is used for the entire bucket, + /// reducing overhead of encryption. + pub fn with_bucket_key(mut self, enabled: bool) -> Self { + self.encryption_bucket_key_enabled = Some(ConfigValue::Parsed(enabled)); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(mut self) -> Result { @@ -882,6 +959,18 @@ impl AmazonS3Builder { (None, None, false) => format!("https://s3.{region}.amazonaws.com/{bucket}"), }; + let encryption_headers = if let Some(encryption_type) = self.encryption_type { + S3EncryptionHeaders::try_new( + &encryption_type.get()?, + self.encryption_kms_key_id, + self.encryption_bucket_key_enabled + .map(|val| val.get()) + .transpose()?, + )? + } else { + S3EncryptionHeaders::default() + }; + let config = S3Config { region, endpoint: self.endpoint, @@ -897,6 +986,7 @@ impl AmazonS3Builder { checksum, copy_if_not_exists, conditional_put: put_precondition, + encryption_headers, }; let client = Arc::new(S3Client::new(config)?); @@ -912,6 +1002,121 @@ fn parse_bucket_az(bucket: &str) -> Option<&str> { Some(bucket.strip_suffix("--x-s3")?.rsplit_once("--")?.1) } +/// Encryption configuration options for S3. +/// +/// These options are used to configure server-side encryption for S3 objects. +/// To configure them, pass them to [`AmazonS3Builder::with_config`]. +/// +/// Both [SSE-KMS] and [DSSE-KMS] are supported. [SSE-C] is not yet supported. +/// +/// [SSE-KMS]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html +/// [DSSE-KMS]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingDSSEncryption.html +/// [SSE-C]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] +pub enum S3EncryptionConfigKey { + /// Type of encryption to use. If set, must be one of "AES256", "aws:kms", or "aws:kms:dsse". 
+ ServerSideEncryption, + /// The KMS key ID to use for server-side encryption. If set, ServerSideEncryption + /// must be "aws:kms" or "aws:kms:dsse". + KmsKeyId, + /// If set to true, will use the bucket's default KMS key for server-side encryption. + /// If set to false, will disable the use of the bucket's default KMS key for server-side encryption. + BucketKeyEnabled, +} + +impl AsRef for S3EncryptionConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::ServerSideEncryption => "aws_server_side_encryption", + Self::KmsKeyId => "aws_sse_kms_key_id", + Self::BucketKeyEnabled => "aws_sse_bucket_key_enabled", + } + } +} + +#[derive(Debug, Clone)] +enum S3EncryptionType { + S3, + SseKms, + DsseKms, +} + +impl crate::config::Parse for S3EncryptionType { + fn parse(s: &str) -> Result { + match s { + "AES256" => Ok(Self::S3), + "aws:kms" => Ok(Self::SseKms), + "aws:kms:dsse" => Ok(Self::DsseKms), + _ => Err(Error::InvalidEncryptionType { passed: s.into() }.into()), + } + } +} + +impl From<&S3EncryptionType> for &'static str { + fn from(value: &S3EncryptionType) -> Self { + match value { + S3EncryptionType::S3 => "AES256", + S3EncryptionType::SseKms => "aws:kms", + S3EncryptionType::DsseKms => "aws:kms:dsse", + } + } +} + +impl std::fmt::Display for S3EncryptionType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.into()) + } +} + +/// A sequence of headers to be sent for write requests that specify server-side +/// encryption. +/// +/// Whether these headers are sent depends on both the kind of encryption set +/// and the kind of request being made. +#[derive(Default, Clone, Debug)] +pub struct S3EncryptionHeaders(HeaderMap); + +impl S3EncryptionHeaders { + fn try_new( + encryption_type: &S3EncryptionType, + key_id: Option, + bucket_key_enabled: Option, + ) -> Result { + let mut headers = HeaderMap::new(); + // Note: if we later add support for SSE-C, we should be sure to use + // HeaderValue::set_sensitive to prevent the key from being logged. 
+ headers.insert( + "x-amz-server-side-encryption", + HeaderValue::from_static(encryption_type.into()), + ); + if let Some(key_id) = key_id { + headers.insert( + "x-amz-server-side-encryption-aws-kms-key-id", + key_id + .try_into() + .map_err(|err| Error::InvalidEncryptionHeader { + header: "kms-key-id", + source: Box::new(err), + })?, + ); + } + if let Some(bucket_key_enabled) = bucket_key_enabled { + headers.insert( + "x-amz-server-side-encryption-bucket-key-enabled", + HeaderValue::from_static(if bucket_key_enabled { "true" } else { "false" }), + ); + } + Ok(Self(headers)) + } +} + +impl From for HeaderMap { + fn from(headers: S3EncryptionHeaders) -> Self { + headers.0 + } +} + #[cfg(test)] mod tests { use super::*; @@ -967,7 +1172,10 @@ mod tests { .with_config(AmazonS3ConfigKey::DefaultRegion, &aws_default_region) .with_config(AmazonS3ConfigKey::Endpoint, &aws_endpoint) .with_config(AmazonS3ConfigKey::Token, &aws_session_token) - .with_config(AmazonS3ConfigKey::UnsignedPayload, "true"); + .with_config(AmazonS3ConfigKey::UnsignedPayload, "true") + .with_config("aws_server_side_encryption".parse().unwrap(), "AES256") + .with_config("aws_sse_kms_key_id".parse().unwrap(), "some_key_id") + .with_config("aws_sse_bucket_key_enabled".parse().unwrap(), "true"); assert_eq!( builder @@ -1003,6 +1211,24 @@ mod tests { .unwrap(), "true" ); + assert_eq!( + builder + .get_config_value(&"aws_server_side_encryption".parse().unwrap()) + .unwrap(), + "AES256" + ); + assert_eq!( + builder + .get_config_value(&"aws_sse_kms_key_id".parse().unwrap()) + .unwrap(), + "some_key_id" + ); + assert_eq!( + builder + .get_config_value(&"aws_sse_bucket_key_enabled".parse().unwrap()) + .unwrap(), + "true" + ); } #[test] diff --git a/src/aws/client.rs b/src/aws/client.rs index a31350f..aa9f6bf 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
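// Editorial sketch (not part of this patch): the request headers that
// `S3EncryptionHeaders::try_new` (added in builder.rs above) produces for SSE-KMS
// with an explicit key id and bucket keys enabled. `try_new` and `S3EncryptionType`
// are private to the builder module, so this is illustrative only; the key id is
// hypothetical and `HeaderMap` is the reqwest re-export imported there.
fn sse_kms_headers_sketch() -> crate::Result<()> {
    let headers: HeaderMap = S3EncryptionHeaders::try_new(
        &S3EncryptionType::SseKms,
        Some("my-kms-key-id".to_string()),
        Some(true),
    )?
    .into();
    assert_eq!(headers["x-amz-server-side-encryption"], "aws:kms");
    assert_eq!(
        headers["x-amz-server-side-encryption-aws-kms-key-id"],
        "my-kms-key-id"
    );
    assert_eq!(
        headers["x-amz-server-side-encryption-bucket-key-enabled"],
        "true"
    );
    Ok(())
}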
+use crate::aws::builder::S3EncryptionHeaders; use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; use crate::aws::{ @@ -182,6 +183,7 @@ pub struct S3Config { pub checksum: Option, pub copy_if_not_exists: Option, pub conditional_put: Option, + pub encryption_headers: S3EncryptionHeaders, } impl S3Config { @@ -321,9 +323,17 @@ impl S3Client { /// Make an S3 PUT request /// /// Returns the ETag - pub fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> Request<'a> { + pub fn put_request<'a>( + &'a self, + path: &'a Path, + bytes: Bytes, + with_encryption_headers: bool, + ) -> Request<'a> { let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); + if with_encryption_headers { + builder = builder.headers(self.config.encryption_headers.clone().into()); + } let mut payload_sha256 = None; if let Some(checksum) = self.config.checksum { @@ -490,7 +500,8 @@ impl S3Client { let builder = self .client .request(Method::PUT, url) - .header("x-amz-copy-source", source); + .header("x-amz-copy-source", source) + .headers(self.config.encryption_headers.clone().into()); Request { builder, @@ -508,6 +519,7 @@ impl S3Client { let response = self .client .request(Method::POST, url) + .headers(self.config.encryption_headers.clone().into()) .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await @@ -532,7 +544,7 @@ impl S3Client { let part = (part_idx + 1).to_string(); let response = self - .put_request(path, data) + .put_request(path, data, false) .query(&[("partNumber", &part), ("uploadId", upload_id)]) .send() .await?; diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 4e88524..b11f451 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -62,7 +62,7 @@ mod dynamo; mod precondition; mod resolve; -pub use builder::{AmazonS3Builder, AmazonS3ConfigKey}; +pub use builder::{AmazonS3Builder, AmazonS3ConfigKey, S3EncryptionHeaders}; pub use checksum::Checksum; pub use dynamo::DynamoCommit; pub use precondition::{S3ConditionalPut, S3CopyIfNotExists}; @@ -164,7 +164,7 @@ impl Signer for AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { - let mut request = self.client.put_request(location, bytes); + let mut request = self.client.put_request(location, bytes, true); let tags = opts.tags.encoded(); if !tags.is_empty() && !self.client.config.disable_tagging { request = request.header(&TAGS_HEADER, tags); @@ -374,8 +374,10 @@ impl MultiPartStore for AmazonS3 { #[cfg(test)] mod tests { use super::*; - use crate::tests::*; + use crate::{client::get::GetClient, tests::*}; use bytes::Bytes; + use hyper::HeaderMap; + use tokio::io::AsyncWriteExt; const NON_EXISTENT_NAME: &str = "nonexistentname"; @@ -397,6 +399,7 @@ mod tests { stream_get(&integration).await; multipart(&integration, &integration).await; signing(&integration).await; + s3_encryption(&integration).await; // Object tagging is not supported by S3 Express One Zone if config.session_provider.is_none() { @@ -515,4 +518,49 @@ mod tests { v2.list_with_delimiter(Some(&prefix)).await.unwrap(); } + + async fn s3_encryption(store: &AmazonS3) { + crate::test_util::maybe_skip_integration!(); + + let data = Bytes::from(vec![3u8; 1024]); + + let encryption_headers: HeaderMap = store.client.config.encryption_headers.clone().into(); + let expected_encryption = + if let Some(encryption_type) = encryption_headers.get("x-amz-server-side-encryption") { + encryption_type 
+ } else { + eprintln!("Skipping S3 encryption test - encryption not configured"); + return; + }; + + let locations = [ + Path::from("test-encryption-1"), + Path::from("test-encryption-2"), + Path::from("test-encryption-3"), + ]; + + store.put(&locations[0], data.clone()).await.unwrap(); + store.copy(&locations[0], &locations[1]).await.unwrap(); + + let (_, mut writer) = store.put_multipart(&locations[2]).await.unwrap(); + writer.write_all(&data).await.unwrap(); + writer.shutdown().await.unwrap(); + + for location in &locations { + let res = store + .client + .get_request(location, GetOptions::default()) + .await + .unwrap(); + let headers = res.headers(); + assert_eq!( + headers + .get("x-amz-server-side-encryption") + .expect("object is not encrypted"), + expected_encryption + ); + + store.delete(location).await.unwrap(); + } + } } From fe50dcea1296598ff45a2d84639831d35900dd57 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Feb 2024 03:52:13 +1300 Subject: [PATCH 271/397] Update nix requirement from 0.27.1 to 0.28.0 in /object_store (#5432) Updates the requirements on [nix](https://github.com/nix-rust/nix) to permit the latest version. - [Changelog](https://github.com/nix-rust/nix/blob/master/CHANGELOG.md) - [Commits](https://github.com/nix-rust/nix/compare/v0.27.1...v0.28.0) --- updated-dependencies: - dependency-name: nix dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f3aaf35..6b38a8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut md-5 = { version = "0.10.6", default-features = false, optional = true } [target.'cfg(target_family="unix")'.dev-dependencies] -nix = { version = "0.27.1", features = ["fs"] } +nix = { version = "0.28.0", features = ["fs"] } [features] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] From 27c50d9b55ee0063f2781454f8090a24d73f8502 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 27 Feb 2024 15:39:36 +1300 Subject: [PATCH 272/397] Add BufWriter for Adapative Put / Multipart Upload (#5431) * Add BufWriter * Review feedback --- src/buffered.rs | 163 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 161 insertions(+), 2 deletions(-) diff --git a/src/buffered.rs b/src/buffered.rs index 3a1354f..fdefe59 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -18,7 +18,7 @@ //! 
Utilities for performing tokio-style buffered IO use crate::path::Path; -use crate::{ObjectMeta, ObjectStore}; +use crate::{MultipartId, ObjectMeta, ObjectStore}; use bytes::Bytes; use futures::future::{BoxFuture, FutureExt}; use futures::ready; @@ -27,7 +27,7 @@ use std::io::{Error, ErrorKind, SeekFrom}; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use tokio::io::{AsyncBufRead, AsyncRead, AsyncSeek, ReadBuf}; +use tokio::io::{AsyncBufRead, AsyncRead, AsyncSeek, AsyncWrite, AsyncWriteExt, ReadBuf}; /// The default buffer size used by [`BufReader`] pub const DEFAULT_BUFFER_SIZE: usize = 1024 * 1024; @@ -205,6 +205,138 @@ impl AsyncBufRead for BufReader { } } +/// An async buffered writer compatible with the tokio IO traits +/// +/// Up to `capacity` bytes will be buffered in memory, and flushed on shutdown +/// using [`ObjectStore::put`]. If `capacity` is exceeded, data will instead be +/// streamed using [`ObjectStore::put_multipart`] +pub struct BufWriter { + capacity: usize, + state: BufWriterState, + multipart_id: Option, + store: Arc, +} + +impl std::fmt::Debug for BufWriter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BufWriter") + .field("capacity", &self.capacity) + .field("multipart_id", &self.multipart_id) + .finish() + } +} + +type MultipartResult = (MultipartId, Box); + +enum BufWriterState { + /// Buffer up to capacity bytes + Buffer(Path, Vec), + /// [`ObjectStore::put_multipart`] + Prepare(BoxFuture<'static, std::io::Result>), + /// Write to a multipart upload + Write(Box), + /// [`ObjectStore::put`] + Put(BoxFuture<'static, std::io::Result<()>>), +} + +impl BufWriter { + /// Create a new [`BufWriter`] from the provided [`ObjectStore`] and [`Path`] + pub fn new(store: Arc, path: Path) -> Self { + Self::with_capacity(store, path, 10 * 1024 * 1024) + } + + /// Create a new [`BufWriter`] from the provided [`ObjectStore`], [`Path`] and `capacity` + pub fn with_capacity(store: Arc, path: Path, capacity: usize) -> Self { + Self { + capacity, + store, + state: BufWriterState::Buffer(path, Vec::new()), + multipart_id: None, + } + } + + /// Returns the [`MultipartId`] if multipart upload + pub fn multipart_id(&self) -> Option<&MultipartId> { + self.multipart_id.as_ref() + } +} + +impl AsyncWrite for BufWriter { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let cap = self.capacity; + loop { + return match &mut self.state { + BufWriterState::Write(write) => Pin::new(write).poll_write(cx, buf), + BufWriterState::Put(_) => panic!("Already shut down"), + BufWriterState::Prepare(f) => { + let (id, w) = ready!(f.poll_unpin(cx)?); + self.state = BufWriterState::Write(w); + self.multipart_id = Some(id); + continue; + } + BufWriterState::Buffer(path, b) => { + if b.len().saturating_add(buf.len()) >= cap { + let buffer = std::mem::take(b); + let path = std::mem::take(path); + let store = Arc::clone(&self.store); + self.state = BufWriterState::Prepare(Box::pin(async move { + let (id, mut writer) = store.put_multipart(&path).await?; + writer.write_all(&buffer).await?; + Ok((id, writer)) + })); + continue; + } + b.extend_from_slice(buf); + Poll::Ready(Ok(buf.len())) + } + }; + } + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + return match &mut self.state { + BufWriterState::Buffer(_, _) => Poll::Ready(Ok(())), + BufWriterState::Write(write) => Pin::new(write).poll_flush(cx), + BufWriterState::Put(_) => panic!("Already shut down"), 
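// Editorial sketch (not part of this patch): typical use of the `BufWriter` added
// above. Writes that stay below `capacity` are buffered in memory and flushed with
// a single `ObjectStore::put` on shutdown; once `capacity` is exceeded the writer
// switches to `ObjectStore::put_multipart`. The store, path and sizes below are
// hypothetical; `InMemory` is this crate's in-memory store.
async fn buf_writer_usage_sketch() -> std::io::Result<()> {
    use tokio::io::AsyncWriteExt;
    let store: Arc<dyn ObjectStore> = Arc::new(crate::memory::InMemory::new());
    let mut writer =
        BufWriter::with_capacity(Arc::clone(&store), Path::from("sketch.bin"), 8 * 1024);
    writer.write_all(&[0u8; 1024]).await?; // under capacity: buffered, no multipart upload
    writer.shutdown().await?; // flushed via ObjectStore::put
    assert!(writer.multipart_id().is_none());
    Ok(())
}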
+ BufWriterState::Prepare(f) => { + let (id, w) = ready!(f.poll_unpin(cx)?); + self.state = BufWriterState::Write(w); + self.multipart_id = Some(id); + continue; + } + }; + } + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + match &mut self.state { + BufWriterState::Prepare(f) => { + let (id, w) = ready!(f.poll_unpin(cx)?); + self.state = BufWriterState::Write(w); + self.multipart_id = Some(id); + } + BufWriterState::Buffer(p, b) => { + let buf = std::mem::take(b); + let path = std::mem::take(p); + let store = Arc::clone(&self.store); + self.state = BufWriterState::Put(Box::pin(async move { + store.put(&path, buf.into()).await?; + Ok(()) + })); + } + BufWriterState::Put(f) => return f.poll_unpin(cx), + BufWriterState::Write(w) => return Pin::new(w).poll_shutdown(cx), + } + } + } +} + /// Port of standardised function as requires Rust 1.66 /// /// @@ -300,4 +432,31 @@ mod tests { assert!(buffer.is_empty()); } } + + #[tokio::test] + async fn test_buf_writer() { + let store = Arc::new(InMemory::new()) as Arc; + let path = Path::from("file.txt"); + + // Test put + let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 30); + writer.write_all(&[0; 20]).await.unwrap(); + writer.flush().await.unwrap(); + writer.write_all(&[0; 5]).await.unwrap(); + assert!(writer.multipart_id().is_none()); + writer.shutdown().await.unwrap(); + assert!(writer.multipart_id().is_none()); + assert_eq!(store.head(&path).await.unwrap().size, 25); + + // Test multipart + let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 30); + writer.write_all(&[0; 20]).await.unwrap(); + writer.flush().await.unwrap(); + writer.write_all(&[0; 20]).await.unwrap(); + assert!(writer.multipart_id().is_some()); + writer.shutdown().await.unwrap(); + assert!(writer.multipart_id().is_some()); + + assert_eq!(store.head(&path).await.unwrap().size, 40); + } } From 8ed584689cd9933f8b0230dea0d3a9f59f33f489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Metehan=20Y=C4=B1ld=C4=B1r=C4=B1m?= <100111937+metesynnada@users.noreply.github.com> Date: Thu, 29 Feb 2024 05:39:17 +0300 Subject: [PATCH 273/397] Update local.rs (#5441) --- src/local.rs | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/local.rs b/src/local.rs index e985ff0..d631771 100644 --- a/src/local.rs +++ b/src/local.rs @@ -283,20 +283,20 @@ impl LocalFileSystem { }), }) } -} -impl Config { /// Return an absolute filesystem path of the given file location - fn path_to_filesystem(&self, location: &Path) -> Result { + pub fn path_to_filesystem(&self, location: &Path) -> Result { ensure!( is_valid_file_path(location), InvalidPathSnafu { path: location.as_ref() } ); - self.prefix_to_filesystem(location) + self.config.prefix_to_filesystem(location) } +} +impl Config { /// Return an absolute filesystem path of the given location fn prefix_to_filesystem(&self, location: &Path) -> Result { let mut url = self.root.clone(); @@ -340,7 +340,7 @@ impl ObjectStore for LocalFileSystem { return Err(crate::Error::NotImplemented); } - let path = self.config.path_to_filesystem(location)?; + let path = self.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); @@ -399,7 +399,7 @@ impl ObjectStore for LocalFileSystem { &self, location: &Path, ) -> Result<(MultipartId, Box)> { - let dest = self.config.path_to_filesystem(location)?; + let dest = 
self.path_to_filesystem(location)?; let (file, suffix) = new_staged_upload(&dest)?; Ok(( @@ -409,7 +409,7 @@ impl ObjectStore for LocalFileSystem { } async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { - let dest = self.config.path_to_filesystem(location)?; + let dest = self.path_to_filesystem(location)?; let path: PathBuf = staged_upload_path(&dest, multipart_id); maybe_spawn_blocking(move || match std::fs::remove_file(&path) { @@ -424,7 +424,7 @@ impl ObjectStore for LocalFileSystem { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let location = location.clone(); - let path = self.config.path_to_filesystem(&location)?; + let path = self.path_to_filesystem(&location)?; maybe_spawn_blocking(move || { let (file, metadata) = open_file(&path)?; let meta = convert_metadata(metadata, location)?; @@ -445,7 +445,7 @@ impl ObjectStore for LocalFileSystem { } async fn get_range(&self, location: &Path, range: Range) -> Result { - let path = self.config.path_to_filesystem(location)?; + let path = self.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let (mut file, _) = open_file(&path)?; read_range(&mut file, &path, range) @@ -454,7 +454,7 @@ impl ObjectStore for LocalFileSystem { } async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { - let path = self.config.path_to_filesystem(location)?; + let path = self.path_to_filesystem(location)?; let ranges = ranges.to_vec(); maybe_spawn_blocking(move || { // Vectored IO might be faster @@ -468,7 +468,7 @@ impl ObjectStore for LocalFileSystem { } async fn delete(&self, location: &Path) -> Result<()> { - let path = self.config.path_to_filesystem(location)?; + let path = self.path_to_filesystem(location)?; maybe_spawn_blocking(move || match std::fs::remove_file(&path) { Ok(_) => Ok(()), Err(e) => Err(match e.kind() { @@ -599,8 +599,8 @@ impl ObjectStore for LocalFileSystem { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let from = self.config.path_to_filesystem(from)?; - let to = self.config.path_to_filesystem(to)?; + let from = self.path_to_filesystem(from)?; + let to = self.path_to_filesystem(to)?; let mut id = 0; // In order to make this atomic we: // @@ -628,8 +628,8 @@ impl ObjectStore for LocalFileSystem { } async fn rename(&self, from: &Path, to: &Path) -> Result<()> { - let from = self.config.path_to_filesystem(from)?; - let to = self.config.path_to_filesystem(to)?; + let from = self.path_to_filesystem(from)?; + let to = self.path_to_filesystem(to)?; maybe_spawn_blocking(move || loop { match std::fs::rename(&from, &to) { Ok(_) => return Ok(()), @@ -643,8 +643,8 @@ impl ObjectStore for LocalFileSystem { } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - let from = self.config.path_to_filesystem(from)?; - let to = self.config.path_to_filesystem(to)?; + let from = self.path_to_filesystem(from)?; + let to = self.path_to_filesystem(to)?; maybe_spawn_blocking(move || loop { match std::fs::hard_link(&from, &to) { @@ -1235,7 +1235,7 @@ mod tests { let url = Url::from_directory_path(&canonical).unwrap(); let path = Path::parse(url.path()).unwrap(); - let roundtrip = integration.config.path_to_filesystem(&path).unwrap(); + let roundtrip = integration.path_to_filesystem(&path).unwrap(); // Needed as on Windows canonicalize returns extended length path syntax // C:\Users\circleci -> \\?\C:\Users\circleci From bf023aa4a5f3d2762b05f4286cfaee76942d092b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> Date: Fri, 1 Mar 2024 19:50:31 +1300 Subject: [PATCH 274/397] Prepare object_store 0.9.1 (#5449) --- CHANGELOG-old.md | 59 ++++++++++++++++++++++++ CHANGELOG.md | 77 +++++++++++++------------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 4 files changed, 93 insertions(+), 49 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index d01b8a3..141a8b9 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,65 @@ # Historical Changelog +# Changelog + +## [object_store_0.9.0](https://github.com/apache/arrow-rs/tree/object_store_0.9.0) (2024-01-05) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.8.0...object_store_0.9.0) + +**Breaking changes:** + +- Remove deprecated try\_with\_option methods [\#5237](https://github.com/apache/arrow-rs/pull/5237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: full HTTP range support [\#5222](https://github.com/apache/arrow-rs/pull/5222) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([clbarnes](https://github.com/clbarnes)) +- feat\(object\_store\): use http1 by default [\#5204](https://github.com/apache/arrow-rs/pull/5204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- refactor: change `object_store` CA handling [\#5056](https://github.com/apache/arrow-rs/pull/5056) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) + +**Implemented enhancements:** + +- Azure Signed URL Support [\#5232](https://github.com/apache/arrow-rs/issues/5232) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object-store\] Make aws region optional. 
[\#5211](https://github.com/apache/arrow-rs/issues/5211) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store,gcp\] Document GoogleCloudStorage Default Credentials [\#5187](https://github.com/apache/arrow-rs/issues/5187) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support S3 Express One Zone [\#5140](https://github.com/apache/arrow-rs/issues/5140) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `object_store`: Allow 403 Forbidden for `copy_if_not_exists` S3 status code [\#5132](https://github.com/apache/arrow-rs/issues/5132) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add `copy_if_not_exists` support for AmazonS3 via DynamoDB Lock Support [\#4880](https://github.com/apache/arrow-rs/issues/4880) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: native certs, w/o webpki-roots [\#4870](https://github.com/apache/arrow-rs/issues/4870) +- object\_store: range request with suffix [\#4611](https://github.com/apache/arrow-rs/issues/4611) + +**Fixed bugs:** + +- ObjectStore::get\_opts Incorrectly Returns Response Size not Object Size [\#5272](https://github.com/apache/arrow-rs/issues/5272) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Single object store has limited throughput on GCS [\#5194](https://github.com/apache/arrow-rs/issues/5194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- local::tests::invalid\_path fails during object store release verification [\#5035](https://github.com/apache/arrow-rs/issues/5035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Object Store Doctest Failure with Default Features [\#5025](https://github.com/apache/arrow-rs/issues/5025) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Document default value of InstanceCredentialProvider [\#5188](https://github.com/apache/arrow-rs/pull/5188) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([justinabrahms](https://github.com/justinabrahms)) + +**Merged pull requests:** + +- Retry Safe/Read-Only Requests on Timeout [\#5278](https://github.com/apache/arrow-rs/pull/5278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix ObjectMeta::size for range requests \(\#5272\) [\#5276](https://github.com/apache/arrow-rs/pull/5276) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- docs\(object\_store\): Mention `with_allow_http` in docs of `with_endpoint` [\#5275](https://github.com/apache/arrow-rs/pull/5275) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- Support S3 Express One Zone [\#5268](https://github.com/apache/arrow-rs/pull/5268) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object\_store\): Azure url signing [\#5259](https://github.com/apache/arrow-rs/pull/5259) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- DynamoDB ConditionalPut [\#5247](https://github.com/apache/arrow-rs/pull/5247) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Default AWS region to us-east-1 \(\#5211\) 
[\#5244](https://github.com/apache/arrow-rs/pull/5244) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ci: Fail Miri CI on first failure [\#5243](https://github.com/apache/arrow-rs/pull/5243) ([Jefffrey](https://github.com/Jefffrey)) +- Bump actions/upload-pages-artifact from 2 to 3 [\#5229](https://github.com/apache/arrow-rs/pull/5229) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump actions/setup-python from 4 to 5 [\#5175](https://github.com/apache/arrow-rs/pull/5175) ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: ensure take\_fixed\_size\_list can handle null indices [\#5170](https://github.com/apache/arrow-rs/pull/5170) ([westonpace](https://github.com/westonpace)) +- Bump actions/labeler from 4.3.0 to 5.0.0 [\#5167](https://github.com/apache/arrow-rs/pull/5167) ([dependabot[bot]](https://github.com/apps/dependabot)) +- object\_store: fix failing doctest with default features [\#5161](https://github.com/apache/arrow-rs/pull/5161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Jefffrey](https://github.com/Jefffrey)) +- Update rustls-pemfile requirement from 1.0 to 2.0 in /object\_store [\#5155](https://github.com/apache/arrow-rs/pull/5155) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow 403 for overwrite prevention [\#5134](https://github.com/apache/arrow-rs/pull/5134) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([emcake](https://github.com/emcake)) +- Fix ObjectStore.LocalFileSystem.put\_opts for blobfuse [\#5094](https://github.com/apache/arrow-rs/pull/5094) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) +- Update itertools requirement from 0.11.0 to 0.12.0 in /object\_store [\#5077](https://github.com/apache/arrow-rs/pull/5077) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add a PR under "Breaking changes" in the object\_store 0.8.0 changelog [\#5063](https://github.com/apache/arrow-rs/pull/5063) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([carols10cents](https://github.com/carols10cents)) +- Prepare arrow 49.0.0 [\#5054](https://github.com/apache/arrow-rs/pull/5054) ([tustvold](https://github.com/tustvold)) +- Fix invalid\_path test [\#5026](https://github.com/apache/arrow-rs/pull/5026) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement `copy_if_not_exist` for `AmazonS3` using DynamoDB \(\#4880\) [\#4918](https://github.com/apache/arrow-rs/pull/4918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + ## [object_store_0.8.0](https://github.com/apache/arrow-rs/tree/object_store_0.8.0) (2023-11-02) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.1...object_store_0.8.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index db2009b..0c6af67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,61 +19,46 @@ # Changelog -## [object_store_0.9.0](https://github.com/apache/arrow-rs/tree/object_store_0.9.0) (2024-01-05) +## [object_store_0.9.1](https://github.com/apache/arrow-rs/tree/object_store_0.9.1) (2024-03-01) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.8.0...object_store_0.9.0) - -**Breaking changes:** - -- Remove deprecated try\_with\_option methods [\#5237](https://github.com/apache/arrow-rs/pull/5237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- object\_store: full HTTP range support [\#5222](https://github.com/apache/arrow-rs/pull/5222) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([clbarnes](https://github.com/clbarnes)) -- feat\(object\_store\): use http1 by default [\#5204](https://github.com/apache/arrow-rs/pull/5204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- refactor: change `object_store` CA handling [\#5056](https://github.com/apache/arrow-rs/pull/5056) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.9.0...object_store_0.9.1) **Implemented enhancements:** -- Azure Signed URL Support [\#5232](https://github.com/apache/arrow-rs/issues/5232) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[object-store\] Make aws region optional. [\#5211](https://github.com/apache/arrow-rs/issues/5211) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[object\_store,gcp\] Document GoogleCloudStorage Default Credentials [\#5187](https://github.com/apache/arrow-rs/issues/5187) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support S3 Express One Zone [\#5140](https://github.com/apache/arrow-rs/issues/5140) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- `object_store`: Allow 403 Forbidden for `copy_if_not_exists` S3 status code [\#5132](https://github.com/apache/arrow-rs/issues/5132) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add `copy_if_not_exists` support for AmazonS3 via DynamoDB Lock Support [\#4880](https://github.com/apache/arrow-rs/issues/4880) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: native certs, w/o webpki-roots [\#4870](https://github.com/apache/arrow-rs/issues/4870) -- object\_store: range request with suffix [\#4611](https://github.com/apache/arrow-rs/issues/4611) +- \[object\_store\] Enable anonymous read access for Azure [\#5424](https://github.com/apache/arrow-rs/issues/5424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support for additional URL formats in object\_store for Azure blob [\#5370](https://github.com/apache/arrow-rs/issues/5370) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Mention "Http" support in README [\#5320](https://github.com/apache/arrow-rs/issues/5320) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Pass Options to HttpBuilder in parse\_url\_opts [\#5310](https://github.com/apache/arrow-rs/issues/5310) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Remove Localstack DynamoDB Workaround Once Fixed Upstream [\#5267](https://github.com/apache/arrow-rs/issues/5267) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Can I use S3 server side encryption [\#5087](https://github.com/apache/arrow-rs/issues/5087) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** 
-- ObjectStore::get\_opts Incorrectly Returns Response Size not Object Size [\#5272](https://github.com/apache/arrow-rs/issues/5272) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Single object store has limited throughput on GCS [\#5194](https://github.com/apache/arrow-rs/issues/5194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- local::tests::invalid\_path fails during object store release verification [\#5035](https://github.com/apache/arrow-rs/issues/5035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Object Store Doctest Failure with Default Features [\#5025](https://github.com/apache/arrow-rs/issues/5025) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] - -**Documentation updates:** - -- Document default value of InstanceCredentialProvider [\#5188](https://github.com/apache/arrow-rs/pull/5188) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([justinabrahms](https://github.com/justinabrahms)) +- delete\_stream fails in MinIO [\#5414](https://github.com/apache/arrow-rs/issues/5414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] Completing an empty Multipart Upload fails for AWS S3 [\#5404](https://github.com/apache/arrow-rs/issues/5404) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Multipart upload can leave futures unpolled, leading to timeout [\#5366](https://github.com/apache/arrow-rs/issues/5366) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Broken Link in README \(Rust Object Store\) Content [\#5309](https://github.com/apache/arrow-rs/issues/5309) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Retry Safe/Read-Only Requests on Timeout [\#5278](https://github.com/apache/arrow-rs/pull/5278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix ObjectMeta::size for range requests \(\#5272\) [\#5276](https://github.com/apache/arrow-rs/pull/5276) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- docs\(object\_store\): Mention `with_allow_http` in docs of `with_endpoint` [\#5275](https://github.com/apache/arrow-rs/pull/5275) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) -- Support S3 Express One Zone [\#5268](https://github.com/apache/arrow-rs/pull/5268) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat\(object\_store\): Azure url signing [\#5259](https://github.com/apache/arrow-rs/pull/5259) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- DynamoDB ConditionalPut [\#5247](https://github.com/apache/arrow-rs/pull/5247) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Default AWS region to us-east-1 \(\#5211\) [\#5244](https://github.com/apache/arrow-rs/pull/5244) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- ci: Fail Miri CI on first failure [\#5243](https://github.com/apache/arrow-rs/pull/5243) ([Jefffrey](https://github.com/Jefffrey)) -- Bump actions/upload-pages-artifact from 2 to 3 
[\#5229](https://github.com/apache/arrow-rs/pull/5229) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Bump actions/setup-python from 4 to 5 [\#5175](https://github.com/apache/arrow-rs/pull/5175) ([dependabot[bot]](https://github.com/apps/dependabot)) -- fix: ensure take\_fixed\_size\_list can handle null indices [\#5170](https://github.com/apache/arrow-rs/pull/5170) ([westonpace](https://github.com/westonpace)) -- Bump actions/labeler from 4.3.0 to 5.0.0 [\#5167](https://github.com/apache/arrow-rs/pull/5167) ([dependabot[bot]](https://github.com/apps/dependabot)) -- object\_store: fix failing doctest with default features [\#5161](https://github.com/apache/arrow-rs/pull/5161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Jefffrey](https://github.com/Jefffrey)) -- Update rustls-pemfile requirement from 1.0 to 2.0 in /object\_store [\#5155](https://github.com/apache/arrow-rs/pull/5155) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Allow 403 for overwrite prevention [\#5134](https://github.com/apache/arrow-rs/pull/5134) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([emcake](https://github.com/emcake)) -- Fix ObjectStore.LocalFileSystem.put\_opts for blobfuse [\#5094](https://github.com/apache/arrow-rs/pull/5094) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) -- Update itertools requirement from 0.11.0 to 0.12.0 in /object\_store [\#5077](https://github.com/apache/arrow-rs/pull/5077) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add a PR under "Breaking changes" in the object\_store 0.8.0 changelog [\#5063](https://github.com/apache/arrow-rs/pull/5063) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([carols10cents](https://github.com/carols10cents)) -- Prepare arrow 49.0.0 [\#5054](https://github.com/apache/arrow-rs/pull/5054) ([tustvold](https://github.com/tustvold)) -- Fix invalid\_path test [\#5026](https://github.com/apache/arrow-rs/pull/5026) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Implement `copy_if_not_exist` for `AmazonS3` using DynamoDB \(\#4880\) [\#4918](https://github.com/apache/arrow-rs/pull/4918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose path\_to\_filesystem public [\#5441](https://github.com/apache/arrow-rs/pull/5441) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) +- Update nix requirement from 0.27.1 to 0.28.0 in /object\_store [\#5432](https://github.com/apache/arrow-rs/pull/5432) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add BufWriter for Adapative Put / Multipart Upload [\#5431](https://github.com/apache/arrow-rs/pull/5431) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Enable anonymous access for MicrosoftAzure [\#5425](https://github.com/apache/arrow-rs/pull/5425) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) +- fix\(object\_store\): Include Content-MD5 header for S3 
DeleteObjects [\#5415](https://github.com/apache/arrow-rs/pull/5415) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([paraseba](https://github.com/paraseba)) +- docds\(object\_store\): Mention HTTP/WebDAV in README [\#5409](https://github.com/apache/arrow-rs/pull/5409) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- \[object\_store\] Fix empty Multipart Upload for AWS S3 [\#5405](https://github.com/apache/arrow-rs/pull/5405) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) +- feat: S3 server-side encryption [\#5402](https://github.com/apache/arrow-rs/pull/5402) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Pull container name from URL for Azure blob [\#5371](https://github.com/apache/arrow-rs/pull/5371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([bradvoth](https://github.com/bradvoth)) +- docs\(object-store\): add warning to flush [\#5369](https://github.com/apache/arrow-rs/pull/5369) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Minor\(docs\): update master to main for DataFusion/Ballista [\#5363](https://github.com/apache/arrow-rs/pull/5363) ([caicancai](https://github.com/caicancai)) +- Test parse\_url\_opts for HTTP \(\#5310\) [\#5316](https://github.com/apache/arrow-rs/pull/5316) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update IOx links [\#5312](https://github.com/apache/arrow-rs/pull/5312) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Pass options to HTTPBuilder in parse\_url\_opts \(\#5310\) [\#5311](https://github.com/apache/arrow-rs/pull/5311) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Bump actions/cache from 3 to 4 [\#5308](https://github.com/apache/arrow-rs/pull/5308) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Remove localstack DynamoDB workaround \(\#5267\) [\#5307](https://github.com/apache/arrow-rs/pull/5307) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- refactor: log server error during object store retries [\#5294](https://github.com/apache/arrow-rs/pull/5294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Prepare arrow 50.0.0 [\#5291](https://github.com/apache/arrow-rs/pull/5291) ([tustvold](https://github.com/tustvold)) +- Enable JS tests again [\#5287](https://github.com/apache/arrow-rs/pull/5287) ([domoritz](https://github.com/domoritz)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index 6b38a8d..4cc7559 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.9.0" +version = "0.9.1" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index a083f61..83d5c32 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e 
-SINCE_TAG="object_store_0.8.0" -FUTURE_RELEASE="object_store_0.9.0" +SINCE_TAG="object_store_0.9.0" +FUTURE_RELEASE="object_store_0.9.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 29e259d0e27db9d461b57da0392b04cce9fca257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Guedes?= Date: Sun, 3 Mar 2024 18:04:31 -0300 Subject: [PATCH 275/397] Uses ResourceType for filtering directories instead of workaround (#5452) --- src/azure/client.rs | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index feea2f2..5be6658 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -36,7 +36,6 @@ use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use hyper::http::HeaderName; -use itertools::Itertools; use reqwest::header::CONTENT_TYPE; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_MATCH, IF_NONE_MATCH}, @@ -564,7 +563,7 @@ struct ListResultInternal { } fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result { - let prefix = prefix.map(Path::from).unwrap_or_default(); + let prefix = prefix.unwrap_or_default(); let common_prefixes = value .blobs .blob_prefix @@ -576,18 +575,14 @@ fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result 0 && obj.location.as_ref().len() > prefix.as_ref().len() { - Some(obj) - } else { - None - } + // Note: Filters out directories from list results when hierarchical namespaces are + // enabled. When we want directories, its always via the BlobPrefix mechanics, + // and during lists we state that prefixes are evaluated on path segment basis. + .filter(|blob| { + !matches!(blob.properties.resource_type.as_ref(), Some(typ) if typ == "directory") + && blob.name.len() > prefix.len() }) + .map(ObjectMeta::try_from) .collect::>()?; Ok(ListResult { @@ -657,6 +652,8 @@ struct BlobProperties { pub content_language: Option, #[serde(rename = "Etag")] pub e_tag: Option, + #[serde(rename = "ResourceType")] + pub resource_type: Option, } #[derive(Debug, Clone, PartialEq, Eq)] From 292556b63b41f7f7e854c64de4461f9ae6b7a39a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 5 Mar 2024 07:29:09 +1300 Subject: [PATCH 276/397] Update base64 requirement from 0.21 to 0.22 in /object_store (#5465) Updates the requirements on [base64](https://github.com/marshallpierce/rust-base64) to permit the latest version. - [Changelog](https://github.com/marshallpierce/rust-base64/blob/master/RELEASE-NOTES.md) - [Commits](https://github.com/marshallpierce/rust-base64/compare/v0.21.0...v0.22.0) --- updated-dependencies: - dependency-name: base64 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4cc7559..a1e80ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ url = "2.2" walkdir = "2" # Cloud storage support -base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } +base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } hyper = { version = "0.14", default-features = false, optional = true } quick-xml = { version = "0.31.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } From 232dad1e8d5e0b20aabed0f821f46de522630dd5 Mon Sep 17 00:00:00 2001 From: Itayazolay Date: Wed, 6 Mar 2024 21:07:29 +0200 Subject: [PATCH 277/397] add support for gcp application default auth on windows in object store (#5473) * add support for gcp application default auth on windows in object store * syntax err * clippy --------- Co-authored-by: Itay Azolay --- src/gcp/credential.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index dc504da..34cd6ee 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -393,7 +393,11 @@ pub enum ApplicationDefaultCredentials { } impl ApplicationDefaultCredentials { - const CREDENTIALS_PATH: &'static str = ".config/gcloud/application_default_credentials.json"; + const CREDENTIALS_PATH: &'static str = if cfg!(windows) { + "gcloud/application_default_credentials.json" + } else { + ".config/gcloud/application_default_credentials.json" + }; // Create a new application default credential in the following situations: // 1. a file is passed in and the type matches. @@ -402,7 +406,9 @@ impl ApplicationDefaultCredentials { if let Some(path) = path { return read_credentials_file::(path).map(Some); } - if let Some(home) = env::var_os("HOME") { + + let home_var = if cfg!(windows) { "APPDATA" } else { "HOME" }; + if let Some(home) = env::var_os(home_var) { let path = Path::new(&home).join(Self::CREDENTIALS_PATH); // It's expected for this file to not exist unless it has been explicitly configured by the user. 
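Editorial aside, not part of the patch above: the GCP change resolves the application default credentials file from a platform-specific base directory. A minimal sketch of the lookup it implies, assuming only that `HOME` (unix) or `APPDATA` (windows) is set:

```rust
use std::path::PathBuf;

/// Sketch of the path resolution implied by the patch above; returns None when
/// the relevant environment variable is unset.
fn application_default_credentials_path() -> Option<PathBuf> {
    let base_var = if cfg!(windows) { "APPDATA" } else { "HOME" };
    let rel = if cfg!(windows) {
        "gcloud/application_default_credentials.json"
    } else {
        ".config/gcloud/application_default_credentials.json"
    };
    Some(PathBuf::from(std::env::var_os(base_var)?).join(rel))
}
```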
From d092b35d4aa742c5374657f140525e9ccca64653 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:18:04 +1300 Subject: [PATCH 278/397] Update latest chrono (#5479) --- src/lib.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index af5676e..8132002 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1693,7 +1693,9 @@ mod tests { } let options = GetOptions { - if_unmodified_since: Some(meta.last_modified + chrono::Duration::hours(10)), + if_unmodified_since: Some( + meta.last_modified + chrono::Duration::try_hours(10).unwrap(), + ), ..GetOptions::default() }; match storage.get_opts(&path, options).await { @@ -1702,7 +1704,9 @@ mod tests { } let options = GetOptions { - if_unmodified_since: Some(meta.last_modified - chrono::Duration::hours(10)), + if_unmodified_since: Some( + meta.last_modified - chrono::Duration::try_hours(10).unwrap(), + ), ..GetOptions::default() }; match storage.get_opts(&path, options).await { @@ -1720,7 +1724,7 @@ mod tests { } let options = GetOptions { - if_modified_since: Some(meta.last_modified - chrono::Duration::hours(10)), + if_modified_since: Some(meta.last_modified - chrono::Duration::try_hours(10).unwrap()), ..GetOptions::default() }; match storage.get_opts(&path, options).await { From f0ad78f247327a139b4740758981a9f750d6a2f2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:20:11 +1300 Subject: [PATCH 279/397] Implement MultiPartStore for InMemory (#5495) --- src/memory.rs | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/src/memory.rs b/src/memory.rs index 41cfcc4..41ee109 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -16,6 +16,7 @@ // under the License. //! 
An in-memory object store implementation +use crate::multipart::{MultiPartStore, PartId}; use crate::util::InvalidGetRange; use crate::{ path::Path, GetRange, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, @@ -28,8 +29,8 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use parking_lot::RwLock; use snafu::{OptionExt, ResultExt, Snafu}; -use std::collections::BTreeMap; use std::collections::BTreeSet; +use std::collections::{BTreeMap, HashMap}; use std::io; use std::ops::Range; use std::pin::Pin; @@ -52,6 +53,12 @@ enum Error { #[snafu(display("ETag required for conditional update"))] MissingETag, + + #[snafu(display("MultipartUpload not found: {id}"))] + UploadNotFound { id: String }, + + #[snafu(display("Missing part at index: {part}"))] + MissingPart { part: usize }, } impl From for super::Error { @@ -101,6 +108,12 @@ impl Entry { struct Storage { next_etag: usize, map: BTreeMap, + uploads: HashMap, +} + +#[derive(Debug, Default, Clone)] +struct PartStorage { + parts: Vec>, } type SharedStorage = Arc>; @@ -154,6 +167,24 @@ impl Storage { } } } + + fn upload_mut(&mut self, id: &MultipartId) -> Result<&mut PartStorage> { + let parts = id + .parse() + .ok() + .and_then(|x| self.uploads.get_mut(&x)) + .context(UploadNotFoundSnafu { id })?; + Ok(parts) + } + + fn remove_upload(&mut self, id: &MultipartId) -> Result { + let parts = id + .parse() + .ok() + .and_then(|x| self.uploads.remove(&x)) + .context(UploadNotFoundSnafu { id })?; + Ok(parts) + } } impl std::fmt::Display for InMemory { @@ -359,6 +390,64 @@ impl ObjectStore for InMemory { } } +#[async_trait] +impl MultiPartStore for InMemory { + async fn create_multipart(&self, _path: &Path) -> Result { + let mut storage = self.storage.write(); + let etag = storage.next_etag; + storage.next_etag += 1; + storage.uploads.insert(etag, Default::default()); + Ok(etag.to_string()) + } + + async fn put_part( + &self, + _path: &Path, + id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + let mut storage = self.storage.write(); + let upload = storage.upload_mut(id)?; + if part_idx <= upload.parts.len() { + upload.parts.resize(part_idx + 1, None); + } + upload.parts[part_idx] = Some(data); + Ok(PartId { + content_id: Default::default(), + }) + } + + async fn complete_multipart( + &self, + path: &Path, + id: &MultipartId, + _parts: Vec, + ) -> Result { + let mut storage = self.storage.write(); + let upload = storage.remove_upload(id)?; + + let mut cap = 0; + for (part, x) in upload.parts.iter().enumerate() { + cap += x.as_ref().context(MissingPartSnafu { part })?.len(); + } + let mut buf = Vec::with_capacity(cap); + for x in &upload.parts { + buf.extend_from_slice(x.as_ref().unwrap()) + } + let etag = storage.insert(path, buf.into()); + Ok(PutResult { + e_tag: Some(etag.to_string()), + version: None, + }) + } + + async fn abort_multipart(&self, _path: &Path, id: &MultipartId) -> Result<()> { + self.storage.write().remove_upload(id)?; + Ok(()) + } +} + impl InMemory { /// Create new in-memory storage. 
pub fn new() -> Self { @@ -444,6 +533,7 @@ mod tests { copy_if_not_exists(&integration).await; stream_get(&integration).await; put_opts(&integration, true).await; + multipart(&integration, &integration).await; } #[tokio::test] From ce34bc90a01cc83076ff0cba168e7cb915b68377 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 16 Mar 2024 18:41:14 -0400 Subject: [PATCH 280/397] Minor: add additional documentation about `BufWriter` (#5519) * Minor: add additional documentation about BufWriter * Update object_store/src/buffered.rs * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Format --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- src/buffered.rs | 7 ++++++- src/lib.rs | 13 +++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/buffered.rs b/src/buffered.rs index fdefe59..9299e11 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -207,6 +207,10 @@ impl AsyncBufRead for BufReader { /// An async buffered writer compatible with the tokio IO traits /// +/// This writer adaptively uses [`ObjectStore::put`] or +/// [`ObjectStore::put_multipart`] depending on the amount of data that has +/// been written. +/// /// Up to `capacity` bytes will be buffered in memory, and flushed on shutdown /// using [`ObjectStore::put`]. If `capacity` is exceeded, data will instead be /// streamed using [`ObjectStore::put_multipart`] @@ -255,7 +259,8 @@ impl BufWriter { } } - /// Returns the [`MultipartId`] if multipart upload + /// Returns the [`MultipartId`] of the multipart upload created by this + /// writer, if any. pub fn multipart_id(&self) -> Option<&MultipartId> { self.multipart_id.as_ref() } diff --git a/src/lib.rs b/src/lib.rs index 8132002..4960f3b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,11 +88,11 @@ //! //! # Why not a Filesystem Interface? //! -//! Whilst this crate does provide a [`BufReader`], the [`ObjectStore`] interface mirrors the APIs -//! of object stores and not filesystems, opting to provide stateless APIs instead of the cursor -//! based interfaces such as [`Read`] or [`Seek`] favoured by filesystems. +//! The [`ObjectStore`] interface is designed to mirror the APIs +//! of object stores and *not* filesystems, and thus has stateless APIs instead +//! of cursor based interfaces such as [`Read`] or [`Seek`] available in filesystems. //! -//! This provides some compelling advantages: +//! This design provides the following advantages: //! //! * All operations are atomic, and readers cannot observe partial and/or failed writes //! * Methods map directly to object store APIs, providing both efficiency and predictability @@ -100,7 +100,12 @@ //! * Allows for functionality not native to filesystems, such as operation preconditions //! and atomic multipart uploads //! +//! This crate does provide [`BufReader`] and [`BufWriter`] adapters +//! which provide a more filesystem-like API for working with the +//! [`ObjectStore`] trait, however, they should be used with care +//! //! [`BufReader`]: buffered::BufReader +//! [`BufWriter`]: buffered::BufWriter //! //! # Adapters //! 
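The `BufWriter` behaviour documented in the patch above (buffer up to `capacity` bytes and flush with a single `put`, otherwise spill into `put_multipart`) can be exercised end to end with a short sketch. This is illustrative only: the `with_capacity` constructor, the deliberately tiny 16-byte capacity, and the in-memory store are assumptions chosen to force the spill into a multipart upload, not part of the patch itself.

```rust
use std::sync::Arc;

use object_store::buffered::BufWriter;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::ObjectStore;
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
    let path = Path::from("data/example");

    // Assumed constructor: a tiny capacity so the second write overflows the
    // in-memory buffer and the writer switches from `put` to `put_multipart`.
    let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 16);

    writer.write_all(&[1u8; 10]).await?; // still buffered, nothing uploaded yet
    writer.write_all(&[2u8; 10]).await?; // exceeds capacity, streamed as multipart
    writer.shutdown().await?;            // completes the upload atomically

    assert_eq!(store.head(&path).await?.size, 20);
    Ok(())
}
```

Because the spill decision is made inside the writer, callers keep the plain tokio `AsyncWrite` interface regardless of which upload path is ultimately used.
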
From 128ccf65ce9312bc7ae806a19282b73d38688f8d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 20 Mar 2024 12:26:54 +1300 Subject: [PATCH 281/397] Replace AsyncWrite with Upload trait and rename MultiPartStore to MultipartStore (#5458) (#5500) * Replace AsyncWrite with Upload trait (#5458) * Make BufWriter abortable * Flesh out cloud implementations * Review feedback * Misc tweaks and fixes * Format * Replace multi-part with multipart * More docs * Clippy * Rename to MultipartUpload * Rename ChunkedUpload to WriteMultipart * Doc tweaks * Apply suggestions from code review Co-authored-by: Andrew Lamb * Docs * Format --------- Co-authored-by: Andrew Lamb --- src/aws/mod.rs | 104 +++++++----- src/azure/mod.rs | 80 +++++---- src/buffered.rs | 84 +++++----- src/chunked.rs | 16 +- src/client/mod.rs | 3 + src/client/parts.rs | 48 ++++++ src/gcp/client.rs | 2 +- src/gcp/mod.rs | 106 ++++++------ src/http/mod.rs | 14 +- src/lib.rs | 127 ++++---------- src/limit.rs | 79 +++++---- src/local.rs | 358 ++++++++++++++-------------------------- src/memory.rs | 95 +++++------ src/multipart.rs | 243 +-------------------------- src/prefix.rs | 14 +- src/throttle.rs | 16 +- src/upload.rs | 175 ++++++++++++++++++++ tests/get_range_file.rs | 10 +- 18 files changed, 691 insertions(+), 883 deletions(-) create mode 100644 src/client/parts.rs create mode 100644 src/upload.rs diff --git a/src/aws/mod.rs b/src/aws/mod.rs index b11f451..b33771d 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -17,17 +17,14 @@ //! An object store implementation for S3 //! -//! ## Multi-part uploads +//! ## Multipart uploads //! -//! Multi-part uploads can be initiated with the [ObjectStore::put_multipart] method. -//! Data passed to the writer is automatically buffered to meet the minimum size -//! requirements for a part. Multiple parts are uploaded concurrently. +//! Multipart uploads can be initiated with the [ObjectStore::put_multipart] method. //! //! If the writer fails for any reason, you may have parts uploaded to AWS but not -//! used that you may be charged for. Use the [ObjectStore::abort_multipart] method -//! to abort the upload and drop those unneeded parts. In addition, you may wish to -//! consider implementing [automatic cleanup] of unused parts that are older than one -//! week. +//! used that you will be charged for. [`MultipartUpload::abort`] may be invoked to drop +//! these unneeded parts, however, it is recommended that you consider implementing +//! [automatic cleanup] of unused parts that are older than some threshold. //! //! 
[automatic cleanup]: https://aws.amazon.com/blogs/aws/s3-lifecycle-management-update-support-for-multipart-uploads-and-delete-markers/ @@ -38,18 +35,17 @@ use futures::{StreamExt, TryStreamExt}; use reqwest::header::{HeaderName, IF_MATCH, IF_NONE_MATCH}; use reqwest::{Method, StatusCode}; use std::{sync::Arc, time::Duration}; -use tokio::io::AsyncWrite; use url::Url; use crate::aws::client::{RequestError, S3Client}; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::CredentialProvider; -use crate::multipart::{MultiPartStore, PartId, PutPart, WriteMultiPart}; +use crate::multipart::{MultipartStore, PartId}; use crate::signer::Signer; use crate::{ - Error, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutMode, - PutOptions, PutResult, Result, + Error, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, ObjectMeta, + ObjectStore, Path, PutMode, PutOptions, PutResult, Result, UploadPart, }; static TAGS_HEADER: HeaderName = HeaderName::from_static("x-amz-tagging"); @@ -85,6 +81,7 @@ const STORE: &str = "S3"; /// [`CredentialProvider`] for [`AmazonS3`] pub type AwsCredentialProvider = Arc>; +use crate::client::parts::Parts; pub use credential::{AwsAuthorizer, AwsCredential}; /// Interface for [Amazon S3](https://aws.amazon.com/s3/). @@ -211,25 +208,18 @@ impl ObjectStore for AmazonS3 { } } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - let id = self.client.create_multipart(location).await?; - - let upload = S3MultiPartUpload { - location: location.clone(), - upload_id: id.clone(), - client: Arc::clone(&self.client), - }; - - Ok((id, Box::new(WriteMultiPart::new(upload, 8)))) - } - - async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { - self.client - .delete_request(location, &[("uploadId", multipart_id)]) - .await + async fn put_multipart(&self, location: &Path) -> Result> { + let upload_id = self.client.create_multipart(location).await?; + + Ok(Box::new(S3MultiPartUpload { + part_idx: 0, + state: Arc::new(UploadState { + client: Arc::clone(&self.client), + location: location.clone(), + upload_id: upload_id.clone(), + parts: Default::default(), + }), + })) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { @@ -319,30 +309,55 @@ impl ObjectStore for AmazonS3 { } } +#[derive(Debug)] struct S3MultiPartUpload { + part_idx: usize, + state: Arc, +} + +#[derive(Debug)] +struct UploadState { + parts: Parts, location: Path, upload_id: String, client: Arc, } #[async_trait] -impl PutPart for S3MultiPartUpload { - async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - self.client - .put_part(&self.location, &self.upload_id, part_idx, buf.into()) +impl MultipartUpload for S3MultiPartUpload { + fn put_part(&mut self, data: Bytes) -> UploadPart { + let idx = self.part_idx; + self.part_idx += 1; + let state = Arc::clone(&self.state); + Box::pin(async move { + let part = state + .client + .put_part(&state.location, &state.upload_id, idx, data) + .await?; + state.parts.put(idx, part); + Ok(()) + }) + } + + async fn complete(&mut self) -> Result { + let parts = self.state.parts.finish(self.part_idx)?; + + self.state + .client + .complete_multipart(&self.state.location, &self.state.upload_id, parts) .await } - async fn complete(&self, completed_parts: Vec) -> Result<()> { - self.client - .complete_multipart(&self.location, &self.upload_id, completed_parts) - .await?; - Ok(()) + async fn abort(&mut self) 
-> Result<()> { + self.state + .client + .delete_request(&self.state.location, &[("uploadId", &self.state.upload_id)]) + .await } } #[async_trait] -impl MultiPartStore for AmazonS3 { +impl MultipartStore for AmazonS3 { async fn create_multipart(&self, path: &Path) -> Result { self.client.create_multipart(path).await } @@ -377,7 +392,6 @@ mod tests { use crate::{client::get::GetClient, tests::*}; use bytes::Bytes; use hyper::HeaderMap; - use tokio::io::AsyncWriteExt; const NON_EXISTENT_NAME: &str = "nonexistentname"; @@ -542,9 +556,9 @@ mod tests { store.put(&locations[0], data.clone()).await.unwrap(); store.copy(&locations[0], &locations[1]).await.unwrap(); - let (_, mut writer) = store.put_multipart(&locations[2]).await.unwrap(); - writer.write_all(&data).await.unwrap(); - writer.shutdown().await.unwrap(); + let mut upload = store.put_multipart(&locations[2]).await.unwrap(); + upload.put_part(data.clone()).await.unwrap(); + upload.complete().await.unwrap(); for location in &locations { let res = store diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 712b7a3..5d3a405 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -19,19 +19,15 @@ //! //! ## Streaming uploads //! -//! [ObjectStore::put_multipart] will upload data in blocks and write a blob from those -//! blocks. Data is buffered internally to make blocks of at least 5MB and blocks -//! are uploaded concurrently. +//! [ObjectStore::put_multipart] will upload data in blocks and write a blob from those blocks. //! -//! [ObjectStore::abort_multipart] is a no-op, since Azure Blob Store doesn't provide -//! a way to drop old blocks. Instead unused blocks are automatically cleaned up -//! after 7 days. +//! Unused blocks will automatically be dropped after 7 days. use crate::{ - multipart::{MultiPartStore, PartId, PutPart, WriteMultiPart}, + multipart::{MultipartStore, PartId}, path::Path, signer::Signer, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, - Result, + GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, ObjectMeta, ObjectStore, + PutOptions, PutResult, Result, UploadPart, }; use async_trait::async_trait; use bytes::Bytes; @@ -40,7 +36,6 @@ use reqwest::Method; use std::fmt::Debug; use std::sync::Arc; use std::time::Duration; -use tokio::io::AsyncWrite; use url::Url; use crate::client::get::GetClientExt; @@ -54,6 +49,8 @@ mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] pub type AzureCredentialProvider = Arc>; +use crate::azure::client::AzureClient; +use crate::client::parts::Parts; pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; @@ -94,21 +91,15 @@ impl ObjectStore for MicrosoftAzure { self.client.put_blob(location, bytes, opts).await } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - let inner = AzureMultiPartUpload { - client: Arc::clone(&self.client), - location: location.to_owned(), - }; - Ok((String::new(), Box::new(WriteMultiPart::new(inner, 8)))) - } - - async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { - // There is no way to drop blocks that have been uploaded. Instead, they simply - // expire in 7 days. 
- Ok(()) + async fn put_multipart(&self, location: &Path) -> Result> { + Ok(Box::new(AzureMultiPartUpload { + part_idx: 0, + state: Arc::new(UploadState { + client: Arc::clone(&self.client), + location: location.clone(), + parts: Default::default(), + }), + })) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { @@ -197,26 +188,49 @@ impl Signer for MicrosoftAzure { /// put_multipart_part -> PUT block /// complete -> PUT block list /// abort -> No equivalent; blocks are simply dropped after 7 days -#[derive(Debug, Clone)] +#[derive(Debug)] struct AzureMultiPartUpload { - client: Arc, + part_idx: usize, + state: Arc, +} + +#[derive(Debug)] +struct UploadState { location: Path, + parts: Parts, + client: Arc, } #[async_trait] -impl PutPart for AzureMultiPartUpload { - async fn put_part(&self, buf: Vec, idx: usize) -> Result { - self.client.put_block(&self.location, idx, buf.into()).await +impl MultipartUpload for AzureMultiPartUpload { + fn put_part(&mut self, data: Bytes) -> UploadPart { + let idx = self.part_idx; + self.part_idx += 1; + let state = Arc::clone(&self.state); + Box::pin(async move { + let part = state.client.put_block(&state.location, idx, data).await?; + state.parts.put(idx, part); + Ok(()) + }) + } + + async fn complete(&mut self) -> Result { + let parts = self.state.parts.finish(self.part_idx)?; + + self.state + .client + .put_block_list(&self.state.location, parts) + .await } - async fn complete(&self, parts: Vec) -> Result<()> { - self.client.put_block_list(&self.location, parts).await?; + async fn abort(&mut self) -> Result<()> { + // Nothing to do Ok(()) } } #[async_trait] -impl MultiPartStore for MicrosoftAzure { +impl MultipartStore for MicrosoftAzure { async fn create_multipart(&self, _: &Path) -> Result { Ok(String::new()) } diff --git a/src/buffered.rs b/src/buffered.rs index 9299e11..39f8eaf 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -18,7 +18,7 @@ //! 
Utilities for performing tokio-style buffered IO use crate::path::Path; -use crate::{MultipartId, ObjectMeta, ObjectStore}; +use crate::{ObjectMeta, ObjectStore, WriteMultipart}; use bytes::Bytes; use futures::future::{BoxFuture, FutureExt}; use futures::ready; @@ -27,7 +27,7 @@ use std::io::{Error, ErrorKind, SeekFrom}; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use tokio::io::{AsyncBufRead, AsyncRead, AsyncSeek, AsyncWrite, AsyncWriteExt, ReadBuf}; +use tokio::io::{AsyncBufRead, AsyncRead, AsyncSeek, AsyncWrite, ReadBuf}; /// The default buffer size used by [`BufReader`] pub const DEFAULT_BUFFER_SIZE: usize = 1024 * 1024; @@ -217,7 +217,6 @@ impl AsyncBufRead for BufReader { pub struct BufWriter { capacity: usize, state: BufWriterState, - multipart_id: Option, store: Arc, } @@ -225,22 +224,19 @@ impl std::fmt::Debug for BufWriter { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("BufWriter") .field("capacity", &self.capacity) - .field("multipart_id", &self.multipart_id) .finish() } } -type MultipartResult = (MultipartId, Box); - enum BufWriterState { /// Buffer up to capacity bytes Buffer(Path, Vec), /// [`ObjectStore::put_multipart`] - Prepare(BoxFuture<'static, std::io::Result>), + Prepare(BoxFuture<'static, std::io::Result>), /// Write to a multipart upload - Write(Box), + Write(Option), /// [`ObjectStore::put`] - Put(BoxFuture<'static, std::io::Result<()>>), + Flush(BoxFuture<'static, std::io::Result<()>>), } impl BufWriter { @@ -255,14 +251,20 @@ impl BufWriter { capacity, store, state: BufWriterState::Buffer(path, Vec::new()), - multipart_id: None, } } - /// Returns the [`MultipartId`] of the multipart upload created by this - /// writer, if any. - pub fn multipart_id(&self) -> Option<&MultipartId> { - self.multipart_id.as_ref() + /// Abort this writer, cleaning up any partially uploaded state + /// + /// # Panic + /// + /// Panics if this writer has already been shutdown or aborted + pub async fn abort(&mut self) -> crate::Result<()> { + match &mut self.state { + BufWriterState::Buffer(_, _) | BufWriterState::Prepare(_) => Ok(()), + BufWriterState::Flush(_) => panic!("Already shut down"), + BufWriterState::Write(x) => x.take().unwrap().abort().await, + } } } @@ -275,12 +277,15 @@ impl AsyncWrite for BufWriter { let cap = self.capacity; loop { return match &mut self.state { - BufWriterState::Write(write) => Pin::new(write).poll_write(cx, buf), - BufWriterState::Put(_) => panic!("Already shut down"), + BufWriterState::Write(Some(write)) => { + write.write(buf); + Poll::Ready(Ok(buf.len())) + } + BufWriterState::Write(None) | BufWriterState::Flush(_) => { + panic!("Already shut down") + } BufWriterState::Prepare(f) => { - let (id, w) = ready!(f.poll_unpin(cx)?); - self.state = BufWriterState::Write(w); - self.multipart_id = Some(id); + self.state = BufWriterState::Write(ready!(f.poll_unpin(cx)?).into()); continue; } BufWriterState::Buffer(path, b) => { @@ -289,9 +294,10 @@ impl AsyncWrite for BufWriter { let path = std::mem::take(path); let store = Arc::clone(&self.store); self.state = BufWriterState::Prepare(Box::pin(async move { - let (id, mut writer) = store.put_multipart(&path).await?; - writer.write_all(&buffer).await?; - Ok((id, writer)) + let upload = store.put_multipart(&path).await?; + let mut chunked = WriteMultipart::new(upload); + chunked.write(&buffer); + Ok(chunked) })); continue; } @@ -305,13 +311,10 @@ impl AsyncWrite for BufWriter { fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> 
{ loop { return match &mut self.state { - BufWriterState::Buffer(_, _) => Poll::Ready(Ok(())), - BufWriterState::Write(write) => Pin::new(write).poll_flush(cx), - BufWriterState::Put(_) => panic!("Already shut down"), + BufWriterState::Write(_) | BufWriterState::Buffer(_, _) => Poll::Ready(Ok(())), + BufWriterState::Flush(_) => panic!("Already shut down"), BufWriterState::Prepare(f) => { - let (id, w) = ready!(f.poll_unpin(cx)?); - self.state = BufWriterState::Write(w); - self.multipart_id = Some(id); + self.state = BufWriterState::Write(ready!(f.poll_unpin(cx)?).into()); continue; } }; @@ -322,21 +325,28 @@ impl AsyncWrite for BufWriter { loop { match &mut self.state { BufWriterState::Prepare(f) => { - let (id, w) = ready!(f.poll_unpin(cx)?); - self.state = BufWriterState::Write(w); - self.multipart_id = Some(id); + self.state = BufWriterState::Write(ready!(f.poll_unpin(cx)?).into()); } BufWriterState::Buffer(p, b) => { let buf = std::mem::take(b); let path = std::mem::take(p); let store = Arc::clone(&self.store); - self.state = BufWriterState::Put(Box::pin(async move { + self.state = BufWriterState::Flush(Box::pin(async move { store.put(&path, buf.into()).await?; Ok(()) })); } - BufWriterState::Put(f) => return f.poll_unpin(cx), - BufWriterState::Write(w) => return Pin::new(w).poll_shutdown(cx), + BufWriterState::Flush(f) => return f.poll_unpin(cx), + BufWriterState::Write(x) => { + let upload = x.take().unwrap(); + self.state = BufWriterState::Flush( + async move { + upload.finish().await?; + Ok(()) + } + .boxed(), + ) + } } } } @@ -357,7 +367,7 @@ mod tests { use super::*; use crate::memory::InMemory; use crate::path::Path; - use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncSeekExt}; + use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; #[tokio::test] async fn test_buf_reader() { @@ -448,9 +458,7 @@ mod tests { writer.write_all(&[0; 20]).await.unwrap(); writer.flush().await.unwrap(); writer.write_all(&[0; 5]).await.unwrap(); - assert!(writer.multipart_id().is_none()); writer.shutdown().await.unwrap(); - assert!(writer.multipart_id().is_none()); assert_eq!(store.head(&path).await.unwrap().size, 25); // Test multipart @@ -458,9 +466,7 @@ mod tests { writer.write_all(&[0; 20]).await.unwrap(); writer.flush().await.unwrap(); writer.write_all(&[0; 20]).await.unwrap(); - assert!(writer.multipart_id().is_some()); writer.shutdown().await.unwrap(); - assert!(writer.multipart_id().is_some()); assert_eq!(store.head(&path).await.unwrap().size, 40); } diff --git a/src/chunked.rs b/src/chunked.rs index d33556f..6db7f4b 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -25,14 +25,13 @@ use async_trait::async_trait; use bytes::{BufMut, Bytes, BytesMut}; use futures::stream::BoxStream; use futures::StreamExt; -use tokio::io::AsyncWrite; use crate::path::Path; +use crate::Result; use crate::{ - GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutOptions, - PutResult, + GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, + PutOptions, PutResult, }; -use crate::{MultipartId, Result}; /// Wraps a [`ObjectStore`] and makes its get response return chunks /// in a controllable manner. 
@@ -67,17 +66,10 @@ impl ObjectStore for ChunkedStore { self.inner.put_opts(location, bytes, opts).await } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> Result> { self.inner.put_multipart(location).await } - async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { - self.inner.abort_multipart(location, multipart_id).await - } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let r = self.inner.get_opts(location, options).await?; let stream = match r.payload { diff --git a/src/client/mod.rs b/src/client/mod.rs index 252e9fd..7728f38 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -40,6 +40,9 @@ pub mod header; #[cfg(any(feature = "aws", feature = "gcp"))] pub mod s3; +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub mod parts; + use async_trait::async_trait; use std::collections::HashMap; use std::str::FromStr; diff --git a/src/client/parts.rs b/src/client/parts.rs new file mode 100644 index 0000000..9fc301e --- /dev/null +++ b/src/client/parts.rs @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::multipart::PartId; +use parking_lot::Mutex; + +/// An interior mutable collection of upload parts and their corresponding part index +#[derive(Debug, Default)] +pub(crate) struct Parts(Mutex>); + +impl Parts { + /// Record the [`PartId`] for a given index + /// + /// Note: calling this method multiple times with the same `part_idx` + /// will result in multiple [`PartId`] in the final output + pub(crate) fn put(&self, part_idx: usize, id: PartId) { + self.0.lock().push((part_idx, id)) + } + + /// Produce the final list of [`PartId`] ordered by `part_idx` + /// + /// `expected` is the number of parts expected in the final result + pub(crate) fn finish(&self, expected: usize) -> crate::Result> { + let mut parts = self.0.lock(); + if parts.len() != expected { + return Err(crate::Error::Generic { + store: "Parts", + source: "Missing part".to_string().into(), + }); + } + parts.sort_unstable_by_key(|(idx, _)| *idx); + Ok(parts.drain(..).map(|(_, v)| v).collect()) + } +} diff --git a/src/gcp/client.rs b/src/gcp/client.rs index e4b0f9a..def53be 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -272,7 +272,7 @@ impl GoogleCloudStorageClient { }) } - /// Initiate a multi-part upload + /// Initiate a multipart upload pub async fn multipart_initiate(&self, path: &Path) -> Result { let credential = self.get_credential().await?; let url = self.object_url(path); diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 8633abb..2058d1f 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -17,18 +17,14 @@ //! 
An object store implementation for Google Cloud Storage //! -//! ## Multi-part uploads +//! ## Multipart uploads //! -//! [Multi-part uploads](https://cloud.google.com/storage/docs/multipart-uploads) -//! can be initiated with the [ObjectStore::put_multipart] method. -//! Data passed to the writer is automatically buffered to meet the minimum size -//! requirements for a part. Multiple parts are uploaded concurrently. -//! -//! If the writer fails for any reason, you may have parts uploaded to GCS but not -//! used that you may be charged for. Use the [ObjectStore::abort_multipart] method -//! to abort the upload and drop those unneeded parts. In addition, you may wish to -//! consider implementing automatic clean up of unused parts that are older than one -//! week. +//! [Multipart uploads](https://cloud.google.com/storage/docs/multipart-uploads) +//! can be initiated with the [ObjectStore::put_multipart] method. If neither +//! [`MultipartUpload::complete`] nor [`MultipartUpload::abort`] is invoked, you may +//! have parts uploaded to GCS but not used, that you will be charged for. It is recommended +//! you configure a [lifecycle rule] to abort incomplete multipart uploads after a certain +//! period of time to avoid being charged for storing partial uploads. //! //! ## Using HTTP/2 //! @@ -36,24 +32,24 @@ //! because it allows much higher throughput in our benchmarks (see //! [#5194](https://github.com/apache/arrow-rs/issues/5194)). HTTP/2 can be //! enabled by setting [crate::ClientConfigKey::Http1Only] to false. +//! +//! [lifecycle rule]: https://cloud.google.com/storage/docs/lifecycle#abort-mpu use std::sync::Arc; use crate::client::CredentialProvider; use crate::{ - multipart::{PartId, PutPart, WriteMultiPart}, - path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, - Result, + multipart::PartId, path::Path, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, + ObjectMeta, ObjectStore, PutOptions, PutResult, Result, UploadPart, }; use async_trait::async_trait; use bytes::Bytes; use client::GoogleCloudStorageClient; use futures::stream::BoxStream; -use tokio::io::AsyncWrite; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; -use crate::multipart::MultiPartStore; +use crate::client::parts::Parts; +use crate::multipart::MultipartStore; pub use builder::{GoogleCloudStorageBuilder, GoogleConfigKey}; pub use credential::GcpCredential; @@ -89,27 +85,50 @@ impl GoogleCloudStorage { } } +#[derive(Debug)] struct GCSMultipartUpload { + state: Arc, + part_idx: usize, +} + +#[derive(Debug)] +struct UploadState { client: Arc, path: Path, multipart_id: MultipartId, + parts: Parts, } #[async_trait] -impl PutPart for GCSMultipartUpload { - /// Upload an object part - async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - self.client - .put_part(&self.path, &self.multipart_id, part_idx, buf.into()) +impl MultipartUpload for GCSMultipartUpload { + fn put_part(&mut self, data: Bytes) -> UploadPart { + let idx = self.part_idx; + self.part_idx += 1; + let state = Arc::clone(&self.state); + Box::pin(async move { + let part = state + .client + .put_part(&state.path, &state.multipart_id, idx, data) + .await?; + state.parts.put(idx, part); + Ok(()) + }) + } + + async fn complete(&mut self) -> Result { + let parts = self.state.parts.finish(self.part_idx)?; + + self.state + .client + .multipart_complete(&self.state.path, &self.state.multipart_id, parts) .await } - /// Complete a multipart upload - 
async fn complete(&self, completed_parts: Vec) -> Result<()> { - self.client - .multipart_complete(&self.path, &self.multipart_id, completed_parts) - .await?; - Ok(()) + async fn abort(&mut self) -> Result<()> { + self.state + .client + .multipart_cleanup(&self.state.path, &self.state.multipart_id) + .await } } @@ -119,27 +138,18 @@ impl ObjectStore for GoogleCloudStorage { self.client.put(location, bytes, opts).await } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> Result> { let upload_id = self.client.multipart_initiate(location).await?; - let inner = GCSMultipartUpload { - client: Arc::clone(&self.client), - path: location.clone(), - multipart_id: upload_id.clone(), - }; - - Ok((upload_id, Box::new(WriteMultiPart::new(inner, 8)))) - } - - async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { - self.client - .multipart_cleanup(location, multipart_id) - .await?; - - Ok(()) + Ok(Box::new(GCSMultipartUpload { + part_idx: 0, + state: Arc::new(UploadState { + client: Arc::clone(&self.client), + path: location.clone(), + multipart_id: upload_id.clone(), + parts: Default::default(), + }), + })) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { @@ -176,7 +186,7 @@ impl ObjectStore for GoogleCloudStorage { } #[async_trait] -impl MultiPartStore for GoogleCloudStorage { +impl MultipartStore for GoogleCloudStorage { async fn create_multipart(&self, path: &Path) -> Result { self.client.multipart_initiate(path).await } diff --git a/src/http/mod.rs b/src/http/mod.rs index f1d11db..626337d 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -37,7 +37,6 @@ use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; -use tokio::io::AsyncWrite; use url::Url; use crate::client::get::GetClientExt; @@ -45,7 +44,7 @@ use crate::client::header::get_etag; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutMode, PutOptions, PutResult, Result, RetryConfig, }; @@ -115,15 +114,8 @@ impl ObjectStore for HttpStore { }) } - async fn put_multipart( - &self, - _location: &Path, - ) -> Result<(MultipartId, Box)> { - Err(super::Error::NotImplemented) - } - - async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { - Err(super::Error::NotImplemented) + async fn put_multipart(&self, _location: &Path) -> Result> { + Err(crate::Error::NotImplemented) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { diff --git a/src/lib.rs b/src/lib.rs index 4960f3b..e02675d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -269,12 +269,11 @@ //! //! # Multipart Upload //! -//! Use the [`ObjectStore::put_multipart`] method to atomically write a large amount of data, -//! with implementations automatically handling parallel, chunked upload where appropriate. +//! Use the [`ObjectStore::put_multipart`] method to atomically write a large amount of data //! //! ``` //! # use object_store::local::LocalFileSystem; -//! # use object_store::ObjectStore; +//! # use object_store::{ObjectStore, WriteMultipart}; //! # use std::sync::Arc; //! # use bytes::Bytes; //! # use tokio::io::AsyncWriteExt; @@ -286,12 +285,10 @@ //! # //! 
let object_store: Arc = get_object_store(); //! let path = Path::from("data/large_file"); -//! let (_id, mut writer) = object_store.put_multipart(&path).await.unwrap(); -//! -//! let bytes = Bytes::from_static(b"hello"); -//! writer.write_all(&bytes).await.unwrap(); -//! writer.flush().await.unwrap(); -//! writer.shutdown().await.unwrap(); +//! let upload = object_store.put_multipart(&path).await.unwrap(); +//! let mut write = WriteMultipart::new(upload); +//! write.write(b"hello"); +//! write.finish().await.unwrap(); //! # } //! ``` //! @@ -501,9 +498,11 @@ pub use tags::TagSet; pub mod multipart; mod parse; +mod upload; mod util; pub use parse::{parse_url, parse_url_opts}; +pub use upload::*; pub use util::GetRange; use crate::path::Path; @@ -520,12 +519,11 @@ use std::fmt::{Debug, Formatter}; use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use std::sync::Arc; -use tokio::io::AsyncWrite; /// An alias for a dynamically dispatched object store implementation. pub type DynObjectStore = dyn ObjectStore; -/// Id type for multi-part uploads. +/// Id type for multipart uploads. pub type MultipartId = String; /// Universal API to multiple object store services. @@ -543,48 +541,11 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Save the provided bytes to the specified location with the given options async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result; - /// Get a multi-part upload that allows writing data in chunks. - /// - /// Most cloud-based uploads will buffer and upload parts in parallel. - /// - /// To complete the upload, [AsyncWrite::poll_shutdown] must be called - /// to completion. This operation is guaranteed to be atomic, it will either - /// make all the written data available at `location`, or fail. No clients - /// should be able to observe a partially written object. - /// - /// For some object stores (S3, GCS, and local in particular), if the - /// writer fails or panics, you must call [ObjectStore::abort_multipart] - /// to clean up partially written data. - /// - ///
- /// It is recommended applications wait for any in-flight requests to complete by calling `flush`, if - /// there may be a significant gap in time (> ~30s) before the next write. - /// These gaps can include times where the function returns control to the - /// caller while keeping the writer open. If `flush` is not called, futures - /// for in-flight requests may be left unpolled long enough for the requests - /// to time out, causing the write to fail. - ///
- /// - /// For applications requiring fine-grained control of multipart uploads - /// see [`MultiPartStore`], although note that this interface cannot be - /// supported by all [`ObjectStore`] backends. - /// - /// For applications looking to implement this interface for a custom - /// multipart API, see [`WriteMultiPart`] which handles the complexities - /// of performing parallel uploads of fixed size parts. - /// - /// [`WriteMultiPart`]: multipart::WriteMultiPart - /// [`MultiPartStore`]: multipart::MultiPartStore - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)>; - - /// Cleanup an aborted upload. + /// Perform a multipart upload /// - /// See documentation for individual stores for exact behavior, as capabilities - /// vary by object store. - async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()>; + /// Client should prefer [`ObjectStore::put`] for small payloads, as streaming uploads + /// typically require multiple separate requests. See [`MultipartUpload`] for more information + async fn put_multipart(&self, location: &Path) -> Result>; /// Return the bytes that are stored at the specified location. async fn get(&self, location: &Path) -> Result { @@ -769,21 +730,10 @@ macro_rules! as_ref_impl { self.as_ref().put_opts(location, bytes, opts).await } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> Result> { self.as_ref().put_multipart(location).await } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { - self.as_ref().abort_multipart(location, multipart_id).await - } - async fn get(&self, location: &Path) -> Result { self.as_ref().get(location).await } @@ -1246,14 +1196,12 @@ mod test_util { #[cfg(test)] mod tests { use super::*; - use crate::multipart::MultiPartStore; + use crate::multipart::MultipartStore; use crate::test_util::flatten_list_stream; use chrono::TimeZone; use futures::stream::FuturesUnordered; use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; - use std::future::Future; - use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { put_get_delete_list_opts(storage).await @@ -1928,12 +1876,11 @@ mod tests { let location = Path::from("test_dir/test_upload_file.txt"); // Can write to storage - let data = get_chunks(5_000, 10); + let data = get_chunks(5 * 1024 * 1024, 3); let bytes_expected = data.concat(); - let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); - for chunk in &data { - writer.write_all(chunk).await.unwrap(); - } + let mut upload = storage.put_multipart(&location).await.unwrap(); + let uploads = data.into_iter().map(|x| upload.put_part(x)); + futures::future::try_join_all(uploads).await.unwrap(); // Object should not yet exist in store let meta_res = storage.head(&location).await; @@ -1949,7 +1896,8 @@ mod tests { let result = storage.list_with_delimiter(None).await.unwrap(); assert_eq!(&result.objects, &[]); - writer.shutdown().await.unwrap(); + upload.complete().await.unwrap(); + let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); @@ -1957,22 +1905,19 @@ mod tests { // Sizes chosen to ensure we write three parts let data = get_chunks(3_200_000, 7); let bytes_expected = data.concat(); - let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); + let upload = 
storage.put_multipart(&location).await.unwrap(); + let mut writer = WriteMultipart::new(upload); for chunk in &data { - writer.write_all(chunk).await.unwrap(); + writer.write(chunk) } - writer.shutdown().await.unwrap(); + writer.finish().await.unwrap(); let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); // We can abort an empty write let location = Path::from("test_dir/test_abort_upload.txt"); - let (upload_id, writer) = storage.put_multipart(&location).await.unwrap(); - drop(writer); - storage - .abort_multipart(&location, &upload_id) - .await - .unwrap(); + let mut upload = storage.put_multipart(&location).await.unwrap(); + upload.abort().await.unwrap(); let get_res = storage.get(&location).await; assert!(get_res.is_err()); assert!(matches!( @@ -1981,17 +1926,13 @@ mod tests { )); // We can abort an in-progress write - let (upload_id, mut writer) = storage.put_multipart(&location).await.unwrap(); - if let Some(chunk) = data.first() { - writer.write_all(chunk).await.unwrap(); - let _ = writer.write(chunk).await.unwrap(); - } - drop(writer); - - storage - .abort_multipart(&location, &upload_id) + let mut upload = storage.put_multipart(&location).await.unwrap(); + upload + .put_part(data.first().unwrap().clone()) .await .unwrap(); + + upload.abort().await.unwrap(); let get_res = storage.get(&location).await; assert!(get_res.is_err()); assert!(matches!( @@ -2186,7 +2127,7 @@ mod tests { storage.delete(&path2).await.unwrap(); } - pub(crate) async fn multipart(storage: &dyn ObjectStore, multipart: &dyn MultiPartStore) { + pub(crate) async fn multipart(storage: &dyn ObjectStore, multipart: &dyn MultipartStore) { let path = Path::from("test_multipart"); let chunk_size = 5 * 1024 * 1024; @@ -2253,7 +2194,7 @@ mod tests { pub(crate) async fn tagging(storage: &dyn ObjectStore, validate: bool, get_tags: F) where F: Fn(Path) -> Fut + Send + Sync, - Fut: Future> + Send, + Fut: std::future::Future> + Send, { use bytes::Buf; use serde::Deserialize; diff --git a/src/limit.rs b/src/limit.rs index d1363d9..e5f6841 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -18,18 +18,16 @@ //! 
An object store that limits the maximum concurrency of the wrapped implementation use crate::{ - BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, PutOptions, PutResult, Result, StreamExt, + BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, Path, PutOptions, PutResult, Result, StreamExt, UploadPart, }; use async_trait::async_trait; use bytes::Bytes; use futures::{FutureExt, Stream}; -use std::io::{Error, IoSlice}; use std::ops::Range; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use tokio::io::AsyncWrite; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; /// Store wrapper that wraps an inner store and limits the maximum number of concurrent @@ -81,18 +79,12 @@ impl ObjectStore for LimitStore { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.put_opts(location, bytes, opts).await } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - let (id, write) = self.inner.put_multipart(location).await?; - Ok((id, Box::new(PermitWrapper::new(write, permit)))) - } - - async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { - let _permit = self.semaphore.acquire().await.unwrap(); - self.inner.abort_multipart(location, multipart_id).await + async fn put_multipart(&self, location: &Path) -> Result> { + let upload = self.inner.put_multipart(location).await?; + Ok(Box::new(LimitUpload { + semaphore: Arc::clone(&self.semaphore), + upload, + })) } async fn get(&self, location: &Path) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); @@ -221,39 +213,42 @@ impl Stream for PermitWrapper { } } -impl AsyncWrite for PermitWrapper { - fn poll_write( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - Pin::new(&mut self.inner).poll_write(cx, buf) - } +/// An [`MultipartUpload`] wrapper that limits the maximum number of concurrent requests +#[derive(Debug)] +pub struct LimitUpload { + upload: Box, + semaphore: Arc, +} - fn poll_flush( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - Pin::new(&mut self.inner).poll_flush(cx) +impl LimitUpload { + /// Create a new [`LimitUpload`] limiting `upload` to `max_concurrency` concurrent requests + pub fn new(upload: Box, max_concurrency: usize) -> Self { + Self { + upload, + semaphore: Arc::new(Semaphore::new(max_concurrency)), + } } +} - fn poll_shutdown( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - Pin::new(&mut self.inner).poll_shutdown(cx) +#[async_trait] +impl MultipartUpload for LimitUpload { + fn put_part(&mut self, data: Bytes) -> UploadPart { + let upload = self.upload.put_part(data); + let s = Arc::clone(&self.semaphore); + Box::pin(async move { + let _permit = s.acquire().await.unwrap(); + upload.await + }) } - fn poll_write_vectored( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[IoSlice<'_>], - ) -> Poll> { - Pin::new(&mut self.inner).poll_write_vectored(cx, bufs) + async fn complete(&mut self) -> Result { + let _permit = self.semaphore.acquire().await.unwrap(); + self.upload.complete().await } - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() + async fn abort(&mut self) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.upload.abort().await } } diff --git a/src/local.rs 
b/src/local.rs index d631771..a7eb466 100644 --- a/src/local.rs +++ b/src/local.rs @@ -16,34 +16,32 @@ // under the License. //! An object store implementation for a local filesystem -use crate::{ - maybe_spawn_blocking, - path::{absolute_path_to_url, Path}, - util::InvalidGetRange, - GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, ObjectStore, - PutMode, PutOptions, PutResult, Result, -}; -use async_trait::async_trait; -use bytes::Bytes; -use chrono::{DateTime, Utc}; -use futures::future::BoxFuture; -use futures::ready; -use futures::{stream::BoxStream, StreamExt}; -use futures::{FutureExt, TryStreamExt}; -use snafu::{ensure, ResultExt, Snafu}; use std::fs::{metadata, symlink_metadata, File, Metadata, OpenOptions}; use std::io::{ErrorKind, Read, Seek, SeekFrom, Write}; use std::ops::Range; -use std::pin::Pin; use std::sync::Arc; -use std::task::Poll; use std::time::SystemTime; use std::{collections::BTreeSet, convert::TryFrom, io}; use std::{collections::VecDeque, path::PathBuf}; -use tokio::io::AsyncWrite; + +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use futures::{stream::BoxStream, StreamExt}; +use futures::{FutureExt, TryStreamExt}; +use parking_lot::Mutex; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; use url::Url; use walkdir::{DirEntry, WalkDir}; +use crate::{ + maybe_spawn_blocking, + path::{absolute_path_to_url, Path}, + util::InvalidGetRange, + GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, + PutMode, PutOptions, PutResult, Result, UploadPart, +}; + /// A specialized `Error` for filesystem object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -155,6 +153,9 @@ pub(crate) enum Error { InvalidPath { path: String, }, + + #[snafu(display("Upload aborted"))] + Aborted, } impl From for super::Error { @@ -342,8 +343,7 @@ impl ObjectStore for LocalFileSystem { let path = self.path_to_filesystem(location)?; maybe_spawn_blocking(move || { - let (mut file, suffix) = new_staged_upload(&path)?; - let staging_path = staged_upload_path(&path, &suffix); + let (mut file, staging_path) = new_staged_upload(&path)?; let mut e_tag = None; let err = match file.write_all(&bytes) { @@ -395,31 +395,10 @@ impl ObjectStore for LocalFileSystem { .await } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - let dest = self.path_to_filesystem(location)?; - - let (file, suffix) = new_staged_upload(&dest)?; - Ok(( - suffix.clone(), - Box::new(LocalUpload::new(dest, suffix, Arc::new(file))), - )) - } - - async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { + async fn put_multipart(&self, location: &Path) -> Result> { let dest = self.path_to_filesystem(location)?; - let path: PathBuf = staged_upload_path(&dest, multipart_id); - - maybe_spawn_blocking(move || match std::fs::remove_file(&path) { - Ok(_) => Ok(()), - Err(source) => match source.kind() { - ErrorKind::NotFound => Ok(()), // Already deleted - _ => Err(Error::UnableToDeleteFile { path, source }.into()), - }, - }) - .await + let (file, src) = new_staged_upload(&dest)?; + Ok(Box::new(LocalUpload::new(src, dest, file))) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { @@ -677,17 +656,17 @@ fn create_parent_dirs(path: &std::path::Path, source: io::Error) -> Result<()> { Ok(()) } -/// Generates a unique file path `{base}#{suffix}`, returning the opened `File` and `suffix` +/// Generates a unique file path 
`{base}#{suffix}`, returning the opened `File` and `path` /// /// Creates any directories if necessary -fn new_staged_upload(base: &std::path::Path) -> Result<(File, String)> { +fn new_staged_upload(base: &std::path::Path) -> Result<(File, PathBuf)> { let mut multipart_id = 1; loop { let suffix = multipart_id.to_string(); let path = staged_upload_path(base, &suffix); let mut options = OpenOptions::new(); match options.read(true).write(true).create_new(true).open(&path) { - Ok(f) => return Ok((f, suffix)), + Ok(f) => return Ok((f, path)), Err(source) => match source.kind() { ErrorKind::AlreadyExists => multipart_id += 1, ErrorKind::NotFound => create_parent_dirs(&path, source)?, @@ -705,194 +684,91 @@ fn staged_upload_path(dest: &std::path::Path, suffix: &str) -> PathBuf { staging_path.into() } -enum LocalUploadState { - /// Upload is ready to send new data - Idle(Arc), - /// In the middle of a write - Writing(Arc, BoxFuture<'static, Result>), - /// In the middle of syncing data and closing file. - /// - /// Future will contain last reference to file, so it will call drop on completion. - ShuttingDown(BoxFuture<'static, Result<(), io::Error>>), - /// File is being moved from it's temporary location to the final location - Committing(BoxFuture<'static, Result<(), io::Error>>), - /// Upload is complete - Complete, +#[derive(Debug)] +struct LocalUpload { + /// The upload state + state: Arc, + /// The location of the temporary file + src: Option, + /// The next offset to write into the file + offset: u64, } -struct LocalUpload { - inner_state: LocalUploadState, +#[derive(Debug)] +struct UploadState { dest: PathBuf, - multipart_id: MultipartId, + file: Mutex>, } impl LocalUpload { - pub fn new(dest: PathBuf, multipart_id: MultipartId, file: Arc) -> Self { + pub fn new(src: PathBuf, dest: PathBuf, file: File) -> Self { Self { - inner_state: LocalUploadState::Idle(file), - dest, - multipart_id, + state: Arc::new(UploadState { + dest, + file: Mutex::new(Some(file)), + }), + src: Some(src), + offset: 0, } } } -impl AsyncWrite for LocalUpload { - fn poll_write( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - buf: &[u8], - ) -> Poll> { - let invalid_state = |condition: &str| -> Poll> { - Poll::Ready(Err(io::Error::new( - ErrorKind::InvalidInput, - format!("Tried to write to file {condition}."), - ))) - }; +#[async_trait] +impl MultipartUpload for LocalUpload { + fn put_part(&mut self, data: Bytes) -> UploadPart { + let offset = self.offset; + self.offset += data.len() as u64; - if let Ok(runtime) = tokio::runtime::Handle::try_current() { - let mut data: Vec = buf.to_vec(); - let data_len = data.len(); - - loop { - match &mut self.inner_state { - LocalUploadState::Idle(file) => { - let file = Arc::clone(file); - let file2 = Arc::clone(&file); - let data: Vec = std::mem::take(&mut data); - self.inner_state = LocalUploadState::Writing( - file, - Box::pin( - runtime - .spawn_blocking(move || (&*file2).write_all(&data)) - .map(move |res| match res { - Err(err) => Err(io::Error::new(ErrorKind::Other, err)), - Ok(res) => res.map(move |_| data_len), - }), - ), - ); - } - LocalUploadState::Writing(file, inner_write) => { - let res = ready!(inner_write.poll_unpin(cx)); - self.inner_state = LocalUploadState::Idle(Arc::clone(file)); - return Poll::Ready(res); - } - LocalUploadState::ShuttingDown(_) => { - return invalid_state("when writer is shutting down"); - } - LocalUploadState::Committing(_) => { - return invalid_state("when writer is committing data"); - } - LocalUploadState::Complete => { - 
return invalid_state("when writer is complete"); - } - } - } - } else if let LocalUploadState::Idle(file) = &self.inner_state { - let file = Arc::clone(file); - (&*file).write_all(buf)?; - Poll::Ready(Ok(buf.len())) - } else { - // If we are running on this thread, then only possible states are Idle and Complete. - invalid_state("when writer is already complete.") - } + let s = Arc::clone(&self.state); + maybe_spawn_blocking(move || { + let mut f = s.file.lock(); + let file = f.as_mut().context(AbortedSnafu)?; + file.seek(SeekFrom::Start(offset)) + .context(SeekSnafu { path: &s.dest })?; + file.write_all(&data).context(UnableToCopyDataToFileSnafu)?; + Ok(()) + }) + .boxed() } - fn poll_flush( - self: Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - ) -> Poll> { - Poll::Ready(Ok(())) + async fn complete(&mut self) -> Result { + let src = self.src.take().context(AbortedSnafu)?; + let s = Arc::clone(&self.state); + maybe_spawn_blocking(move || { + // Ensure no inflight writes + let f = s.file.lock().take().context(AbortedSnafu)?; + std::fs::rename(&src, &s.dest).context(UnableToRenameFileSnafu)?; + let metadata = f.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: src.to_string_lossy().to_string(), + })?; + + Ok(PutResult { + e_tag: Some(get_etag(&metadata)), + version: None, + }) + }) + .await } - fn poll_shutdown( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - if let Ok(runtime) = tokio::runtime::Handle::try_current() { - loop { - match &mut self.inner_state { - LocalUploadState::Idle(file) => { - // We are moving file into the future, and it will be dropped on it's completion, closing the file. - let file = Arc::clone(file); - self.inner_state = LocalUploadState::ShuttingDown(Box::pin( - runtime - .spawn_blocking(move || (*file).sync_all()) - .map(move |res| match res { - Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)), - Ok(res) => res, - }), - )); - } - LocalUploadState::ShuttingDown(fut) => match fut.poll_unpin(cx) { - Poll::Ready(res) => { - res?; - let staging_path = staged_upload_path(&self.dest, &self.multipart_id); - let dest = self.dest.clone(); - self.inner_state = LocalUploadState::Committing(Box::pin( - runtime - .spawn_blocking(move || std::fs::rename(&staging_path, &dest)) - .map(move |res| match res { - Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)), - Ok(res) => res, - }), - )); - } - Poll::Pending => { - return Poll::Pending; - } - }, - LocalUploadState::Writing(_, _) => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::InvalidInput, - "Tried to commit a file where a write is in progress.", - ))); - } - LocalUploadState::Committing(fut) => { - let res = ready!(fut.poll_unpin(cx)); - self.inner_state = LocalUploadState::Complete; - return Poll::Ready(res); - } - LocalUploadState::Complete => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "Already complete", - ))) - } - } - } - } else { - let staging_path = staged_upload_path(&self.dest, &self.multipart_id); - match &mut self.inner_state { - LocalUploadState::Idle(file) => { - let file = Arc::clone(file); - self.inner_state = LocalUploadState::Complete; - file.sync_all()?; - drop(file); - std::fs::rename(staging_path, &self.dest)?; - Poll::Ready(Ok(())) - } - _ => { - // If we are running on this thread, then only possible states are Idle and Complete. 
- Poll::Ready(Err(io::Error::new(ErrorKind::Other, "Already complete"))) - } - } - } + async fn abort(&mut self) -> Result<()> { + let src = self.src.take().context(AbortedSnafu)?; + maybe_spawn_blocking(move || { + std::fs::remove_file(&src).context(UnableToDeleteFileSnafu { path: &src })?; + Ok(()) + }) + .await } } impl Drop for LocalUpload { fn drop(&mut self) { - match self.inner_state { - LocalUploadState::Complete => (), - _ => { - self.inner_state = LocalUploadState::Complete; - let path = staged_upload_path(&self.dest, &self.multipart_id); - // Try to cleanup intermediate file ignoring any error - match tokio::runtime::Handle::try_current() { - Ok(r) => drop(r.spawn_blocking(move || std::fs::remove_file(path))), - Err(_) => drop(std::fs::remove_file(path)), - }; - } + if let Some(src) = self.src.take() { + // Try to clean up intermediate file ignoring any error + match tokio::runtime::Handle::try_current() { + Ok(r) => drop(r.spawn_blocking(move || std::fs::remove_file(src))), + Err(_) => drop(std::fs::remove_file(src)), + }; } } } @@ -1095,12 +971,13 @@ fn convert_walkdir_result( #[cfg(test)] mod tests { - use super::*; - use crate::test_util::flatten_list_stream; - use crate::tests::*; use futures::TryStreamExt; use tempfile::{NamedTempFile, TempDir}; - use tokio::io::AsyncWriteExt; + + use crate::test_util::flatten_list_stream; + use crate::tests::*; + + use super::*; #[tokio::test] async fn file_test() { @@ -1125,7 +1002,18 @@ mod tests { put_get_delete_list(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; - stream_get(&integration).await; + + // Can't use stream_get test as WriteMultipart uses a tokio JoinSet + let p = Path::from("manual_upload"); + let mut upload = integration.put_multipart(&p).await.unwrap(); + upload.put_part(Bytes::from_static(b"123")).await.unwrap(); + upload.put_part(Bytes::from_static(b"45678")).await.unwrap(); + let r = upload.complete().await.unwrap(); + + let get = integration.get(&p).await.unwrap(); + assert_eq!(get.meta.e_tag.as_ref().unwrap(), r.e_tag.as_ref().unwrap()); + let actual = get.bytes().await.unwrap(); + assert_eq!(actual.as_ref(), b"12345678"); }); } @@ -1422,12 +1310,11 @@ mod tests { let location = Path::from("some_file"); let data = Bytes::from("arbitrary data"); - let (multipart_id, mut writer) = integration.put_multipart(&location).await.unwrap(); - writer.write_all(&data).await.unwrap(); + let mut u1 = integration.put_multipart(&location).await.unwrap(); + u1.put_part(data.clone()).await.unwrap(); - let (multipart_id_2, mut writer_2) = integration.put_multipart(&location).await.unwrap(); - assert_ne!(multipart_id, multipart_id_2); - writer_2.write_all(&data).await.unwrap(); + let mut u2 = integration.put_multipart(&location).await.unwrap(); + u2.put_part(data).await.unwrap(); let list = flatten_list_stream(&integration, None).await.unwrap(); assert_eq!(list.len(), 0); @@ -1520,11 +1407,13 @@ mod tests { #[cfg(not(target_arch = "wasm32"))] #[cfg(test)] mod not_wasm_tests { - use crate::local::LocalFileSystem; - use crate::{ObjectStore, Path}; use std::time::Duration; + + use bytes::Bytes; use tempfile::TempDir; - use tokio::io::AsyncWriteExt; + + use crate::local::LocalFileSystem; + use crate::{ObjectStore, Path}; #[tokio::test] async fn test_cleanup_intermediate_files() { @@ -1532,12 +1421,13 @@ mod not_wasm_tests { let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); let location = Path::from("some_file"); - let (_, mut writer) = 
integration.put_multipart(&location).await.unwrap(); - writer.write_all(b"hello").await.unwrap(); + let data = Bytes::from_static(b"hello"); + let mut upload = integration.put_multipart(&location).await.unwrap(); + upload.put_part(data).await.unwrap(); let file_count = std::fs::read_dir(root.path()).unwrap().count(); assert_eq!(file_count, 1); - drop(writer); + drop(upload); tokio::time::sleep(Duration::from_millis(1)).await; @@ -1549,13 +1439,15 @@ mod not_wasm_tests { #[cfg(target_family = "unix")] #[cfg(test)] mod unix_test { - use crate::local::LocalFileSystem; - use crate::{ObjectStore, Path}; + use std::fs::OpenOptions; + use nix::sys::stat; use nix::unistd; - use std::fs::OpenOptions; use tempfile::TempDir; + use crate::local::LocalFileSystem; + use crate::{ObjectStore, Path}; + #[tokio::test] async fn test_fifo() { let filename = "some_file"; diff --git a/src/memory.rs b/src/memory.rs index 41ee109..6c960d4 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -16,27 +16,24 @@ // under the License. //! An in-memory object store implementation -use crate::multipart::{MultiPartStore, PartId}; -use crate::util::InvalidGetRange; -use crate::{ - path::Path, GetRange, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, - PutMode, PutOptions, PutResult, Result, UpdateVersion, -}; -use crate::{GetOptions, MultipartId}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::ops::Range; +use std::sync::Arc; + use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use parking_lot::RwLock; use snafu::{OptionExt, ResultExt, Snafu}; -use std::collections::BTreeSet; -use std::collections::{BTreeMap, HashMap}; -use std::io; -use std::ops::Range; -use std::pin::Pin; -use std::sync::Arc; -use std::task::Poll; -use tokio::io::AsyncWrite; + +use crate::multipart::{MultipartStore, PartId}; +use crate::util::InvalidGetRange; +use crate::GetOptions; +use crate::{ + path::Path, GetRange, GetResult, GetResultPayload, ListResult, MultipartId, MultipartUpload, + ObjectMeta, ObjectStore, PutMode, PutOptions, PutResult, Result, UpdateVersion, UploadPart, +}; /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] @@ -213,23 +210,12 @@ impl ObjectStore for InMemory { }) } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - Ok(( - String::new(), - Box::new(InMemoryUpload { - location: location.clone(), - data: Vec::new(), - storage: Arc::clone(&self.storage), - }), - )) - } - - async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { - // Nothing to clean up - Ok(()) + async fn put_multipart(&self, location: &Path) -> Result> { + Ok(Box::new(InMemoryUpload { + location: location.clone(), + parts: vec![], + storage: Arc::clone(&self.storage), + })) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { @@ -391,7 +377,7 @@ impl ObjectStore for InMemory { } #[async_trait] -impl MultiPartStore for InMemory { +impl MultipartStore for InMemory { async fn create_multipart(&self, _path: &Path) -> Result { let mut storage = self.storage.write(); let etag = storage.next_etag; @@ -482,45 +468,42 @@ impl InMemory { } } +#[derive(Debug)] struct InMemoryUpload { location: Path, - data: Vec, + parts: Vec, storage: Arc>, } -impl AsyncWrite for InMemoryUpload { - fn poll_write( - mut self: Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - buf: &[u8], - ) -> Poll> { - self.data.extend_from_slice(buf); - 
Poll::Ready(Ok(buf.len())) +#[async_trait] +impl MultipartUpload for InMemoryUpload { + fn put_part(&mut self, data: Bytes) -> UploadPart { + self.parts.push(data); + Box::pin(futures::future::ready(Ok(()))) } - fn poll_flush( - self: Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - ) -> Poll> { - Poll::Ready(Ok(())) + async fn complete(&mut self) -> Result { + let cap = self.parts.iter().map(|x| x.len()).sum(); + let mut buf = Vec::with_capacity(cap); + self.parts.iter().for_each(|x| buf.extend_from_slice(x)); + let etag = self.storage.write().insert(&self.location, buf.into()); + Ok(PutResult { + e_tag: Some(etag.to_string()), + version: None, + }) } - fn poll_shutdown( - mut self: Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - ) -> Poll> { - let data = Bytes::from(std::mem::take(&mut self.data)); - self.storage.write().insert(&self.location, data); - Poll::Ready(Ok(())) + async fn abort(&mut self) -> Result<()> { + Ok(()) } } #[cfg(test)] mod tests { - use super::*; - use crate::tests::*; + use super::*; + #[tokio::test] async fn in_memory_test() { let integration = InMemory::new(); diff --git a/src/multipart.rs b/src/multipart.rs index 1dcd5a6..26cce39 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -17,34 +17,16 @@ //! Cloud Multipart Upload //! -//! This crate provides an asynchronous interface for multipart file uploads to cloud storage services. -//! It's designed to offer efficient, non-blocking operations, +//! This crate provides an asynchronous interface for multipart file uploads to +//! cloud storage services. It's designed to offer efficient, non-blocking operations, //! especially useful when dealing with large files or high-throughput systems. use async_trait::async_trait; use bytes::Bytes; -use futures::{stream::FuturesUnordered, Future, StreamExt}; -use std::{io, pin::Pin, sync::Arc, task::Poll}; -use tokio::io::AsyncWrite; use crate::path::Path; use crate::{MultipartId, PutResult, Result}; -type BoxedTryFuture = Pin> + Send>>; - -/// A trait used in combination with [`WriteMultiPart`] to implement -/// [`AsyncWrite`] on top of an API for multipart upload -#[async_trait] -pub trait PutPart: Send + Sync + 'static { - /// Upload a single part - async fn put_part(&self, buf: Vec, part_idx: usize) -> Result; - - /// Complete the upload with the provided parts - /// - /// `completed_parts` is in order of part number - async fn complete(&self, completed_parts: Vec) -> Result<()>; -} - /// Represents a part of a file that has been successfully uploaded in a multipart upload process. #[derive(Debug, Clone)] pub struct PartId { @@ -52,222 +34,6 @@ pub struct PartId { pub content_id: String, } -/// Wrapper around a [`PutPart`] that implements [`AsyncWrite`] -/// -/// Data will be uploaded in fixed size chunks of 10 MiB in parallel, -/// up to the configured maximum concurrency -pub struct WriteMultiPart { - inner: Arc, - /// A list of completed parts, in sequential order. - completed_parts: Vec>, - /// Part upload tasks currently running - tasks: FuturesUnordered>, - /// Maximum number of upload tasks to run concurrently - max_concurrency: usize, - /// Buffer that will be sent in next upload. - current_buffer: Vec, - /// Size of each part. - /// - /// While S3 and Minio support variable part sizes, R2 requires they all be - /// exactly the same size. 
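The completion step of `InMemoryUpload` above is just a concatenation of the buffered parts; a standalone sketch of that step, using `Vec<u8>` in place of `Bytes`:

```rust
/// Concatenate buffered parts into the final object payload,
/// pre-sizing the buffer to avoid reallocation.
fn concat_parts(parts: &[Vec<u8>]) -> Vec<u8> {
    let cap = parts.iter().map(|p| p.len()).sum();
    let mut buf = Vec::with_capacity(cap);
    for part in parts {
        buf.extend_from_slice(part);
    }
    buf
}

fn main() {
    let parts = vec![b"123".to_vec(), b"45678".to_vec()];
    assert_eq!(concat_parts(&parts), b"12345678".to_vec());
    println!("{} bytes", concat_parts(&parts).len());
}
```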
- part_size: usize, - /// Index of current part - current_part_idx: usize, - /// The completion task - completion_task: Option>, -} - -impl WriteMultiPart { - /// Create a new multipart upload with the implementation and the given maximum concurrency - pub fn new(inner: T, max_concurrency: usize) -> Self { - Self { - inner: Arc::new(inner), - completed_parts: Vec::new(), - tasks: FuturesUnordered::new(), - max_concurrency, - current_buffer: Vec::new(), - // TODO: Should self vary by provider? - // TODO: Should we automatically increase then when part index gets large? - - // Minimum size of 5 MiB - // https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html - // https://cloud.google.com/storage/quotas#requests - part_size: 10 * 1024 * 1024, - current_part_idx: 0, - completion_task: None, - } - } - - // Add data to the current buffer, returning the number of bytes added - fn add_to_buffer(mut self: Pin<&mut Self>, buf: &[u8], offset: usize) -> usize { - let remaining_capacity = self.part_size - self.current_buffer.len(); - let to_copy = std::cmp::min(remaining_capacity, buf.len() - offset); - self.current_buffer - .extend_from_slice(&buf[offset..offset + to_copy]); - to_copy - } - - /// Poll current tasks - fn poll_tasks( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Result<(), io::Error> { - if self.tasks.is_empty() { - return Ok(()); - } - while let Poll::Ready(Some(res)) = self.tasks.poll_next_unpin(cx) { - let (part_idx, part) = res?; - let total_parts = self.completed_parts.len(); - self.completed_parts - .resize(std::cmp::max(part_idx + 1, total_parts), None); - self.completed_parts[part_idx] = Some(part); - } - Ok(()) - } - - // The `poll_flush` function will only flush the in-progress tasks. - // The `final_flush` method called during `poll_shutdown` will flush - // the `current_buffer` along with in-progress tasks. - // Please see https://github.com/apache/arrow-rs/issues/3390 for more details. 
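Because part uploads can finish out of order, `poll_tasks` above records each result by its part index and the ordered list is only assembled at shutdown. A standalone sketch of that bookkeeping, with `String` standing in for the crate's `PartId`:

```rust
/// Record a finished part at its index, growing the list as needed.
fn record_part(completed: &mut Vec<Option<String>>, part_idx: usize, id: String) {
    if completed.len() < part_idx + 1 {
        completed.resize(part_idx + 1, None);
    }
    completed[part_idx] = Some(id);
}

fn main() {
    let mut completed: Vec<Option<String>> = Vec::new();
    // Parts may complete in any order
    record_part(&mut completed, 2, "etag-2".to_string());
    record_part(&mut completed, 0, "etag-0".to_string());
    record_part(&mut completed, 1, "etag-1".to_string());

    // At completion every slot must be filled, in part-number order
    let ordered: Option<Vec<String>> = completed.into_iter().collect();
    assert_eq!(ordered.unwrap(), vec!["etag-0", "etag-1", "etag-2"]);
}
```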
- fn final_flush( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - // Poll current tasks - self.as_mut().poll_tasks(cx)?; - - // If current_buffer is not empty, see if it can be submitted - if !self.current_buffer.is_empty() && self.tasks.len() < self.max_concurrency { - let out_buffer: Vec = std::mem::take(&mut self.current_buffer); - let inner = Arc::clone(&self.inner); - let part_idx = self.current_part_idx; - self.tasks.push(Box::pin(async move { - let upload_part = inner.put_part(out_buffer, part_idx).await?; - Ok((part_idx, upload_part)) - })); - } - - self.as_mut().poll_tasks(cx)?; - - // If tasks and current_buffer are empty, return Ready - if self.tasks.is_empty() && self.current_buffer.is_empty() { - Poll::Ready(Ok(())) - } else { - Poll::Pending - } - } -} - -impl AsyncWrite for WriteMultiPart { - fn poll_write( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - buf: &[u8], - ) -> Poll> { - // Poll current tasks - self.as_mut().poll_tasks(cx)?; - - let mut offset = 0; - - loop { - // Fill up current buffer - offset += self.as_mut().add_to_buffer(buf, offset); - - // If we don't have a full buffer or we have too many tasks, break - if self.current_buffer.len() < self.part_size - || self.tasks.len() >= self.max_concurrency - { - break; - } - - let new_buffer = Vec::with_capacity(self.part_size); - let out_buffer = std::mem::replace(&mut self.current_buffer, new_buffer); - let inner = Arc::clone(&self.inner); - let part_idx = self.current_part_idx; - self.tasks.push(Box::pin(async move { - let upload_part = inner.put_part(out_buffer, part_idx).await?; - Ok((part_idx, upload_part)) - })); - self.current_part_idx += 1; - - // We need to poll immediately after adding to setup waker - self.as_mut().poll_tasks(cx)?; - } - - // If offset is zero, then we didn't write anything because we didn't - // have capacity for more tasks and our buffer is full. 
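Both the removed `poll_write`/`final_flush` pair and the `WriteMultipart::write` added later in this patch series reduce to the same chunking arithmetic: top up the current buffer, emit it as a part when full, and leave trailing bytes for the final flush. A minimal synchronous sketch, assuming a hypothetical `part_size` and in-memory parts:

```rust
/// Split `input` into fixed-size parts, carrying a partially filled buffer between calls.
fn fill_parts(buffer: &mut Vec<u8>, part_size: usize, mut input: &[u8], parts: &mut Vec<Vec<u8>>) {
    while !input.is_empty() {
        let remaining = part_size - buffer.len();
        let to_copy = remaining.min(input.len());
        buffer.extend_from_slice(&input[..to_copy]);
        if buffer.len() == part_size {
            // Buffer is full: emit it as a complete part and start a new one
            parts.push(std::mem::replace(buffer, Vec::with_capacity(part_size)));
        }
        input = &input[to_copy..];
    }
}

fn main() {
    let mut buffer = Vec::new();
    let mut parts = Vec::new();
    fill_parts(&mut buffer, 4, b"abcdefghij", &mut parts);
    assert_eq!(parts, vec![b"abcd".to_vec(), b"efgh".to_vec()]);
    // Trailing bytes stay buffered until the next write or the final flush
    assert_eq!(buffer, b"ij".to_vec());
}
```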
- if offset == 0 && !buf.is_empty() { - Poll::Pending - } else { - Poll::Ready(Ok(offset)) - } - } - - fn poll_flush( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - // Poll current tasks - self.as_mut().poll_tasks(cx)?; - - // If tasks is empty, return Ready - if self.tasks.is_empty() { - Poll::Ready(Ok(())) - } else { - Poll::Pending - } - } - - fn poll_shutdown( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - // First, poll flush - match self.as_mut().final_flush(cx) { - Poll::Pending => return Poll::Pending, - Poll::Ready(res) => res?, - }; - - // If shutdown task is not set, set it - let parts = std::mem::take(&mut self.completed_parts); - let parts = parts - .into_iter() - .enumerate() - .map(|(idx, part)| { - part.ok_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - format!("Missing information for upload part {idx}"), - ) - }) - }) - .collect::>()?; - - let inner = Arc::clone(&self.inner); - let completion_task = self.completion_task.get_or_insert_with(|| { - Box::pin(async move { - inner.complete(parts).await?; - Ok(()) - }) - }); - - Pin::new(completion_task).poll(cx) - } -} - -impl std::fmt::Debug for WriteMultiPart { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("WriteMultiPart") - .field("completed_parts", &self.completed_parts) - .field("tasks", &self.tasks) - .field("max_concurrency", &self.max_concurrency) - .field("current_buffer", &self.current_buffer) - .field("part_size", &self.part_size) - .field("current_part_idx", &self.current_part_idx) - .finish() - } -} - /// A low-level interface for interacting with multipart upload APIs /// /// Most use-cases should prefer [`ObjectStore::put_multipart`] as this is supported by more @@ -277,7 +43,7 @@ impl std::fmt::Debug for WriteMultiPart { /// [`ObjectStore::put_multipart`]: crate::ObjectStore::put_multipart /// [`LocalFileSystem`]: crate::local::LocalFileSystem #[async_trait] -pub trait MultiPartStore: Send + Sync + 'static { +pub trait MultipartStore: Send + Sync + 'static { /// Creates a new multipart upload, returning the [`MultipartId`] async fn create_multipart(&self, path: &Path) -> Result; @@ -288,10 +54,11 @@ pub trait MultiPartStore: Send + Sync + 'static { /// /// Most stores require that all parts excluding the last are at least 5 MiB, and some /// further require that all parts excluding the last be the same size, e.g. [R2]. - /// [`WriteMultiPart`] performs writes in fixed size blocks of 10 MiB, and clients wanting + /// [`WriteMultipart`] performs writes in fixed size blocks of 5 MiB, and clients wanting /// to maximise compatibility should look to do likewise. 
/// /// [R2]: https://developers.cloudflare.com/r2/objects/multipart-objects/#limitations + /// [`WriteMultipart`]: crate::upload::WriteMultipart async fn put_part( &self, path: &Path, diff --git a/src/prefix.rs b/src/prefix.rs index 38f9b07..053f71a 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -19,12 +19,11 @@ use bytes::Bytes; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use std::ops::Range; -use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, - Result, + GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutOptions, + PutResult, Result, }; #[doc(hidden)] @@ -91,18 +90,11 @@ impl ObjectStore for PrefixStore { self.inner.put_opts(&full_path, bytes, opts).await } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { + async fn put_multipart(&self, location: &Path) -> Result> { let full_path = self.full_path(location); self.inner.put_multipart(&full_path).await } - async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { - let full_path = self.full_path(location); - self.inner.abort_multipart(&full_path, multipart_id).await - } async fn get(&self, location: &Path) -> Result { let full_path = self.full_path(location); self.inner.get(&full_path).await diff --git a/src/throttle.rs b/src/throttle.rs index 252256a..5ca1eed 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -20,16 +20,15 @@ use parking_lot::Mutex; use std::ops::Range; use std::{convert::TryInto, sync::Arc}; +use crate::GetOptions; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutOptions, - PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, + PutOptions, PutResult, Result, }; -use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; use futures::{stream::BoxStream, FutureExt, StreamExt}; use std::time::Duration; -use tokio::io::AsyncWrite; /// Configuration settings for throttled store #[derive(Debug, Default, Clone, Copy)] @@ -158,14 +157,7 @@ impl ObjectStore for ThrottledStore { self.inner.put_opts(location, bytes, opts).await } - async fn put_multipart( - &self, - _location: &Path, - ) -> Result<(MultipartId, Box)> { - Err(super::Error::NotImplemented) - } - - async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { + async fn put_multipart(&self, _location: &Path) -> Result> { Err(super::Error::NotImplemented) } diff --git a/src/upload.rs b/src/upload.rs new file mode 100644 index 0000000..6f8bfa8 --- /dev/null +++ b/src/upload.rs @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{PutResult, Result};
+use async_trait::async_trait;
+use bytes::Bytes;
+use futures::future::BoxFuture;
+use tokio::task::JoinSet;
+
+/// An upload part request
+pub type UploadPart = BoxFuture<'static, Result<()>>;
+
+/// A trait allowing writing an object in fixed size chunks
+///
+/// Consecutive chunks of data can be written by calling [`MultipartUpload::put_part`] and polling
+/// the returned futures to completion. Multiple futures returned by [`MultipartUpload::put_part`]
+/// may be polled in parallel, allowing for concurrent uploads.
+///
+/// Once all part uploads have been polled to completion, the upload can be completed by
+/// calling [`MultipartUpload::complete`]. This will make the entire uploaded object visible
+/// as an atomic operation. It is implementation defined behaviour if [`MultipartUpload::complete`]
+/// is called before all [`UploadPart`] have been polled to completion.
+#[async_trait]
+pub trait MultipartUpload: Send + std::fmt::Debug {
+    /// Upload the next part
+    ///
+    /// Most stores require that all parts excluding the last are at least 5 MiB, and some
+    /// further require that all parts excluding the last be the same size, e.g. [R2].
+    /// Clients wanting to maximise compatibility should therefore perform writes in
+    /// fixed size blocks larger than 5 MiB.
+    ///
+    /// Implementations may invoke this method multiple times and then await on the
+    /// returned futures in parallel
+    ///
+    /// ```no_run
+    /// # use futures::StreamExt;
+    /// # use object_store::MultipartUpload;
+    /// #
+    /// # async fn test() {
+    /// #
+    /// let mut upload: Box<&dyn MultipartUpload> = todo!();
+    /// let p1 = upload.put_part(vec![0; 10 * 1024 * 1024].into());
+    /// let p2 = upload.put_part(vec![1; 10 * 1024 * 1024].into());
+    /// futures::future::try_join(p1, p2).await.unwrap();
+    /// upload.complete().await.unwrap();
+    /// # }
+    /// ```
+    ///
+    /// [R2]: https://developers.cloudflare.com/r2/objects/multipart-objects/#limitations
+    fn put_part(&mut self, data: Bytes) -> UploadPart;
+
+    /// Complete the multipart upload
+    ///
+    /// It is implementation defined behaviour if this method is called before polling
+    /// all [`UploadPart`] returned by [`MultipartUpload::put_part`] to completion. Additionally,
+    /// it is implementation defined behaviour to call [`MultipartUpload::complete`]
+    /// on an already completed or aborted [`MultipartUpload`].
+    async fn complete(&mut self) -> Result<PutResult>;
+
+    /// Abort the multipart upload
+    ///
+    /// If a [`MultipartUpload`] is dropped without calling [`MultipartUpload::complete`],
+    /// some object stores will automatically clean up any previously uploaded parts.
+    /// However, some stores, such as S3 and GCS, cannot perform cleanup on drop.
+    /// As such [`MultipartUpload::abort`] can be invoked to perform this cleanup.
+    ///
+    /// It will not be possible to call `abort` in all failure scenarios, for example
+    /// non-graceful shutdown of the calling application. It is therefore recommended
+    /// object stores are configured with lifecycle rules to automatically cleanup
+    /// unused parts older than some threshold. See [crate::aws] and [crate::gcp]
+    /// for more information.
+    ///
+    /// It is implementation defined behaviour to call [`MultipartUpload::abort`]
+    /// on an already completed or aborted [`MultipartUpload`]
+    async fn abort(&mut self) -> Result<()>;
+}
+
+/// A synchronous write API for uploading data in parallel in fixed size chunks
+///
+/// Uses multiple tokio tasks in a [`JoinSet`] to multiplex upload tasks in parallel
+///
+/// The design also takes inspiration from [`Sink`] with [`WriteMultipart::wait_for_capacity`]
+/// allowing back pressure on producers, prior to buffering the next part. However, unlike
+/// [`Sink`] this back pressure is optional, allowing integration with synchronous producers
+///
+/// [`Sink`]: futures::sink::Sink
+#[derive(Debug)]
+pub struct WriteMultipart {
+    upload: Box<dyn MultipartUpload>,
+
+    buffer: Vec<u8>,
+
+    tasks: JoinSet<Result<()>>,
+}
+
+impl WriteMultipart {
+    /// Create a new [`WriteMultipart`] that will upload using 5MB chunks
+    pub fn new(upload: Box<dyn MultipartUpload>) -> Self {
+        Self::new_with_capacity(upload, 5 * 1024 * 1024)
+    }
+
+    /// Create a new [`WriteMultipart`] that will upload in fixed `capacity` sized chunks
+    pub fn new_with_capacity(upload: Box<dyn MultipartUpload>, capacity: usize) -> Self {
+        Self {
+            upload,
+            buffer: Vec::with_capacity(capacity),
+            tasks: Default::default(),
+        }
+    }
+
+    /// Wait until there are `max_concurrency` or fewer requests in-flight
+    pub async fn wait_for_capacity(&mut self, max_concurrency: usize) -> Result<()> {
+        while self.tasks.len() > max_concurrency {
+            self.tasks.join_next().await.unwrap()??;
+        }
+        Ok(())
+    }
+
+    /// Write data to this [`WriteMultipart`]
+    ///
+    /// Note this method is synchronous (not `async`) and will immediately start new uploads
+    /// as soon as the internal `capacity` is hit, regardless of
+    /// how many outstanding uploads are already in progress.
+    ///
+    /// Back pressure can optionally be applied to producers by calling
+    /// [`Self::wait_for_capacity`] prior to calling this method
+    pub fn write(&mut self, mut buf: &[u8]) {
+        while !buf.is_empty() {
+            let capacity = self.buffer.capacity();
+            let remaining = capacity - self.buffer.len();
+            let to_read = buf.len().min(remaining);
+            self.buffer.extend_from_slice(&buf[..to_read]);
+            if to_read == remaining {
+                let part = std::mem::replace(&mut self.buffer, Vec::with_capacity(capacity));
+                self.put_part(part.into())
+            }
+            buf = &buf[to_read..]
+ } + } + + fn put_part(&mut self, part: Bytes) { + self.tasks.spawn(self.upload.put_part(part)); + } + + /// Abort this upload, attempting to clean up any successfully uploaded parts + pub async fn abort(mut self) -> Result<()> { + self.tasks.shutdown().await; + self.upload.abort().await + } + + /// Flush final chunk, and await completion of all in-flight requests + pub async fn finish(mut self) -> Result { + if !self.buffer.is_empty() { + let part = std::mem::take(&mut self.buffer); + self.put_part(part.into()) + } + + self.wait_for_capacity(0).await?; + self.upload.complete().await + } +} diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index f73d785..309a86d 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -25,7 +25,6 @@ use object_store::path::Path; use object_store::*; use std::fmt::Formatter; use tempfile::tempdir; -use tokio::io::AsyncWrite; #[derive(Debug)] struct MyStore(LocalFileSystem); @@ -42,14 +41,7 @@ impl ObjectStore for MyStore { self.0.put_opts(path, data, opts).await } - async fn put_multipart( - &self, - _: &Path, - ) -> Result<(MultipartId, Box)> { - todo!() - } - - async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> Result<()> { + async fn put_multipart(&self, _location: &Path) -> Result> { todo!() } From a24d1d37acdcf5fdfab2736a70d731179f89a6e1 Mon Sep 17 00:00:00 2001 From: dimbtp Date: Wed, 20 Mar 2024 14:00:31 +0800 Subject: [PATCH 282/397] fix: copy/rename return error if source is nonexistent (#5528) * copy() return error if `from` is nonexistent * Check `from` in loop to avoid TOCTOU race --- src/lib.rs | 27 +++++++++++++++++++++++++++ src/local.rs | 16 +++++++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e02675d..97604a7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2127,6 +2127,33 @@ mod tests { storage.delete(&path2).await.unwrap(); } + pub(crate) async fn copy_rename_nonexistent_object(storage: &DynObjectStore) { + // Create empty source object + let path1 = Path::from("test1"); + + // Create destination object + let path2 = Path::from("test2"); + storage.put(&path2, Bytes::from("hello")).await.unwrap(); + + // copy() errors if source does not exist + let result = storage.copy(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // rename() errors if source does not exist + let result = storage.rename(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // copy_if_not_exists() errors if source does not exist + let result = storage.copy_if_not_exists(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. 
})); + + // Clean up + storage.delete(&path2).await.unwrap(); + } + pub(crate) async fn multipart(storage: &dyn ObjectStore, multipart: &dyn MultipartStore) { let path = Path::from("test_multipart"); let chunk_size = 5 * 1024 * 1024; diff --git a/src/local.rs b/src/local.rs index a7eb466..6cc0c67 100644 --- a/src/local.rs +++ b/src/local.rs @@ -598,7 +598,10 @@ impl ObjectStore for LocalFileSystem { } Err(source) => match source.kind() { ErrorKind::AlreadyExists => id += 1, - ErrorKind::NotFound => create_parent_dirs(&to, source)?, + ErrorKind::NotFound => match from.exists() { + true => create_parent_dirs(&to, source)?, + false => return Err(Error::NotFound { path: from, source }.into()), + }, _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), }, } @@ -613,7 +616,10 @@ impl ObjectStore for LocalFileSystem { match std::fs::rename(&from, &to) { Ok(_) => return Ok(()), Err(source) => match source.kind() { - ErrorKind::NotFound => create_parent_dirs(&to, source)?, + ErrorKind::NotFound => match from.exists() { + true => create_parent_dirs(&to, source)?, + false => return Err(Error::NotFound { path: from, source }.into()), + }, _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), }, } @@ -636,7 +642,10 @@ impl ObjectStore for LocalFileSystem { } .into()) } - ErrorKind::NotFound => create_parent_dirs(&to, source)?, + ErrorKind::NotFound => match from.exists() { + true => create_parent_dirs(&to, source)?, + false => return Err(Error::NotFound { path: from, source }.into()), + }, _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), }, } @@ -990,6 +999,7 @@ mod tests { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; + copy_rename_nonexistent_object(&integration).await; stream_get(&integration).await; put_opts(&integration, false).await; } From 1f3038b672cec4e0af7161be3d786a101add4994 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 23 Mar 2024 17:42:32 +1300 Subject: [PATCH 283/397] Fix Clippy Lints object_store (#5546) --- src/aws/client.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aws/client.rs b/src/aws/client.rs index aa9f6bf..4d10145 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -138,6 +138,7 @@ struct BatchDeleteResponse { #[derive(Deserialize)] enum DeleteObjectResult { + #[allow(unused)] Deleted(DeletedObject), Error(DeleteError), } From 62d7251e03a4bf4150b01b18fcc476fd1a81a81b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 29 Mar 2024 03:51:47 +0000 Subject: [PATCH 284/397] Update reqwest 0.12 and http 1.0 (#5536) --- Cargo.toml | 8 ++-- src/aws/credential.rs | 18 ++++----- src/azure/credential.rs | 28 ++++++------- src/client/mock_server.rs | 82 +++++++++++++++++++++++---------------- src/client/retry.rs | 35 +++++++++-------- src/parse.rs | 6 +-- 6 files changed, 99 insertions(+), 78 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a1e80ce..79813a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,12 +45,12 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } -hyper = { version = "0.14", default-features = false, optional = true } +hyper = { version = "1.2", default-features = false, optional = true } quick-xml = { version = "0.31.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features 
= false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls-native-roots"], optional = true } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } @@ -69,7 +69,9 @@ tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] [dev-dependencies] # In alphabetical order futures-test = "0.3" -hyper = { version = "0.14.24", features = ["server"] } +hyper = { version = "1.2", features = ["server"] } +hyper-util = "0.1" +http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" diff --git a/src/aws/credential.rs b/src/aws/credential.rs index f8614f4..dd7fa5b 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -738,7 +738,7 @@ struct CreateSessionOutput { mod tests { use super::*; use crate::client::mock_server::MockServer; - use hyper::{Body, Response}; + use hyper::Response; use reqwest::{Client, Method}; use std::env; @@ -939,7 +939,7 @@ mod tests { #[tokio::test] async fn test_mock() { - let server = MockServer::new(); + let server = MockServer::new().await; const IMDSV2_HEADER: &str = "X-aws-ec2-metadata-token"; @@ -955,7 +955,7 @@ mod tests { server.push_fn(|req| { assert_eq!(req.uri().path(), "/latest/api/token"); assert_eq!(req.method(), &Method::PUT); - Response::new(Body::from("cupcakes")) + Response::new("cupcakes".to_string()) }); server.push_fn(|req| { assert_eq!( @@ -965,14 +965,14 @@ mod tests { assert_eq!(req.method(), &Method::GET); let t = req.headers().get(IMDSV2_HEADER).unwrap().to_str().unwrap(); assert_eq!(t, "cupcakes"); - Response::new(Body::from("myrole")) + Response::new("myrole".to_string()) }); server.push_fn(|req| { assert_eq!(req.uri().path(), "/latest/meta-data/iam/security-credentials/myrole"); assert_eq!(req.method(), &Method::GET); let t = req.headers().get(IMDSV2_HEADER).unwrap().to_str().unwrap(); assert_eq!(t, "cupcakes"); - Response::new(Body::from(r#"{"AccessKeyId":"KEYID","Code":"Success","Expiration":"2022-08-30T10:51:04Z","LastUpdated":"2022-08-30T10:21:04Z","SecretAccessKey":"SECRET","Token":"TOKEN","Type":"AWS-HMAC"}"#)) + Response::new(r#"{"AccessKeyId":"KEYID","Code":"Success","Expiration":"2022-08-30T10:51:04Z","LastUpdated":"2022-08-30T10:21:04Z","SecretAccessKey":"SECRET","Token":"TOKEN","Type":"AWS-HMAC"}"#.to_string()) }); let creds = instance_creds(&client, &retry_config, endpoint, true) @@ -989,7 +989,7 @@ mod tests { assert_eq!(req.method(), &Method::PUT); Response::builder() .status(StatusCode::FORBIDDEN) - .body(Body::empty()) + .body(String::new()) .unwrap() }); server.push_fn(|req| { @@ -999,13 +999,13 @@ mod tests { ); assert_eq!(req.method(), &Method::GET); assert!(req.headers().get(IMDSV2_HEADER).is_none()); - Response::new(Body::from("myrole")) + Response::new("myrole".to_string()) }); server.push_fn(|req| { assert_eq!(req.uri().path(), "/latest/meta-data/iam/security-credentials/myrole"); assert_eq!(req.method(), &Method::GET); assert!(req.headers().get(IMDSV2_HEADER).is_none()); - 
Response::new(Body::from(r#"{"AccessKeyId":"KEYID","Code":"Success","Expiration":"2022-08-30T10:51:04Z","LastUpdated":"2022-08-30T10:21:04Z","SecretAccessKey":"SECRET","Token":"TOKEN","Type":"AWS-HMAC"}"#)) + Response::new(r#"{"AccessKeyId":"KEYID","Code":"Success","Expiration":"2022-08-30T10:51:04Z","LastUpdated":"2022-08-30T10:21:04Z","SecretAccessKey":"SECRET","Token":"TOKEN","Type":"AWS-HMAC"}"#.to_string()) }); let creds = instance_creds(&client, &retry_config, endpoint, true) @@ -1020,7 +1020,7 @@ mod tests { server.push( Response::builder() .status(StatusCode::FORBIDDEN) - .body(Body::empty()) + .body(String::new()) .unwrap(), ); diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 9360831..6dc3141 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -930,8 +930,8 @@ impl CredentialProvider for AzureCliCredential { #[cfg(test)] mod tests { use futures::executor::block_on; - use hyper::body::to_bytes; - use hyper::{Body, Response, StatusCode}; + use http_body_util::BodyExt; + use hyper::{Response, StatusCode}; use reqwest::{Client, Method}; use tempfile::NamedTempFile; @@ -942,7 +942,7 @@ mod tests { #[tokio::test] async fn test_managed_identity() { - let server = MockServer::new(); + let server = MockServer::new().await; std::env::set_var(MSI_SECRET_ENV_KEY, "env-secret"); @@ -964,7 +964,7 @@ mod tests { assert_eq!(t, "env-secret"); let t = req.headers().get("metadata").unwrap().to_str().unwrap(); assert_eq!(t, "true"); - Response::new(Body::from( + Response::new( r#" { "access_token": "TOKEN", @@ -975,8 +975,9 @@ mod tests { "resource": "https://management.azure.com/", "token_type": "Bearer" } - "#, - )) + "# + .to_string(), + ) }); let credential = ImdsManagedIdentityProvider::new( @@ -999,7 +1000,7 @@ mod tests { #[tokio::test] async fn test_workload_identity() { - let server = MockServer::new(); + let server = MockServer::new().await; let tokenfile = NamedTempFile::new().unwrap(); let tenant = "tenant"; std::fs::write(tokenfile.path(), "federated-token").unwrap(); @@ -1012,10 +1013,10 @@ mod tests { server.push_fn(move |req| { assert_eq!(req.uri().path(), format!("/{tenant}/oauth2/v2.0/token")); assert_eq!(req.method(), &Method::POST); - let body = block_on(to_bytes(req.into_body())).unwrap(); + let body = block_on(async move { req.into_body().collect().await.unwrap().to_bytes() }); let body = String::from_utf8(body.to_vec()).unwrap(); assert!(body.contains("federated-token")); - Response::new(Body::from( + Response::new( r#" { "access_token": "TOKEN", @@ -1026,8 +1027,9 @@ mod tests { "resource": "https://management.azure.com/", "token_type": "Bearer" } - "#, - )) + "# + .to_string(), + ) }); let credential = WorkloadIdentityOAuthProvider::new( @@ -1050,7 +1052,7 @@ mod tests { #[tokio::test] async fn test_no_credentials() { - let server = MockServer::new(); + let server = MockServer::new().await; let endpoint = server.url(); let store = MicrosoftAzureBuilder::new() @@ -1068,7 +1070,7 @@ mod tests { assert!(req.headers().get("Authorization").is_none()); Response::builder() .status(StatusCode::NOT_FOUND) - .body(Body::from("not found")) + .body("not found".to_string()) .unwrap() }); diff --git a/src/client/mock_server.rs b/src/client/mock_server.rs index 70b8561..aa5a9e0 100644 --- a/src/client/mock_server.rs +++ b/src/client/mock_server.rs @@ -17,18 +17,23 @@ use futures::future::BoxFuture; use futures::FutureExt; -use hyper::service::{make_service_fn, service_fn}; -use hyper::{Body, Request, Response, Server}; +use hyper::body::Incoming; +use 
hyper::server::conn::http1; +use hyper::service::service_fn; +use hyper::{Request, Response}; +use hyper_util::rt::TokioIo; use parking_lot::Mutex; use std::collections::VecDeque; use std::convert::Infallible; use std::future::Future; use std::net::SocketAddr; use std::sync::Arc; +use tokio::net::TcpListener; use tokio::sync::oneshot; -use tokio::task::JoinHandle; +use tokio::task::{JoinHandle, JoinSet}; -pub type ResponseFn = Box) -> BoxFuture<'static, Response> + Send>; +pub type ResponseFn = + Box) -> BoxFuture<'static, Response> + Send>; /// A mock server pub struct MockServer { @@ -39,39 +44,48 @@ pub struct MockServer { } impl MockServer { - pub fn new() -> Self { + pub async fn new() -> Self { let responses: Arc>> = Arc::new(Mutex::new(VecDeque::with_capacity(10))); - let r = Arc::clone(&responses); - let make_service = make_service_fn(move |_conn| { - let r = Arc::clone(&r); - async move { - Ok::<_, Infallible>(service_fn(move |req| { - let r = Arc::clone(&r); - let next = r.lock().pop_front(); - async move { - Ok::<_, Infallible>(match next { - Some(r) => r(req).await, - None => Response::new(Body::from("Hello World")), - }) - } - })) - } - }); + let addr = SocketAddr::from(([127, 0, 0, 1], 0)); + let listener = TcpListener::bind(addr).await.unwrap(); - let (shutdown, rx) = oneshot::channel::<()>(); - let server = Server::bind(&SocketAddr::from(([127, 0, 0, 1], 0))).serve(make_service); + let (shutdown, mut rx) = oneshot::channel::<()>(); - let url = format!("http://{}", server.local_addr()); + let url = format!("http://{}", listener.local_addr().unwrap()); + let r = Arc::clone(&responses); let handle = tokio::spawn(async move { - server - .with_graceful_shutdown(async { - rx.await.ok(); - }) - .await - .unwrap() + let mut set = JoinSet::new(); + + loop { + let (stream, _) = tokio::select! 
{ + conn = listener.accept() => conn.unwrap(), + _ = &mut rx => break, + }; + + let r = Arc::clone(&r); + set.spawn(async move { + let _ = http1::Builder::new() + .serve_connection( + TokioIo::new(stream), + service_fn(move |req| { + let r = Arc::clone(&r); + let next = r.lock().pop_front(); + async move { + Ok::<_, Infallible>(match next { + Some(r) => r(req).await, + None => Response::new("Hello World".to_string()), + }) + } + }), + ) + .await; + }); + } + + set.abort_all(); }); Self { @@ -88,14 +102,14 @@ impl MockServer { } /// Add a response - pub fn push(&self, response: Response) { + pub fn push(&self, response: Response) { self.push_fn(|_| response) } /// Add a response function pub fn push_fn(&self, f: F) where - F: FnOnce(Request) -> Response + Send + 'static, + F: FnOnce(Request) -> Response + Send + 'static, { let f = Box::new(|req| async move { f(req) }.boxed()); self.responses.lock().push_back(f) @@ -103,8 +117,8 @@ impl MockServer { pub fn push_async_fn(&self, f: F) where - F: FnOnce(Request) -> Fut + Send + 'static, - Fut: Future> + Send + 'static, + F: FnOnce(Request) -> Fut + Send + 'static, + Fut: Future> + Send + 'static, { self.responses.lock().push_back(Box::new(|r| f(r).boxed())) } diff --git a/src/client/retry.rs b/src/client/retry.rs index fbd3645..c4f5298 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -259,13 +259,16 @@ impl RetryExt for reqwest::RequestBuilder { Err(e) => { let mut do_retry = false; - if req.method().is_safe() && e.is_timeout() { + if e.is_connect() || ( req.method().is_safe() && e.is_timeout()) { do_retry = true - } else if let Some(source) = e.source() { - if let Some(e) = source.downcast_ref::() { - if e.is_connect() || e.is_closed() || e.is_incomplete_message() { - do_retry = true; + } else { + let mut source = e.source(); + while let Some(e) = source { + if let Some(e) = e.downcast_ref::() { + do_retry = e.is_closed() || e.is_incomplete_message(); + break } + source = e.source(); } } @@ -305,13 +308,13 @@ mod tests { use crate::client::retry::{Error, RetryExt}; use crate::RetryConfig; use hyper::header::LOCATION; - use hyper::{Body, Response}; + use hyper::Response; use reqwest::{Client, Method, StatusCode}; use std::time::Duration; #[tokio::test] async fn test_retry() { - let mock = MockServer::new(); + let mock = MockServer::new().await; let retry = RetryConfig { backoff: Default::default(), @@ -334,7 +337,7 @@ mod tests { mock.push( Response::builder() .status(StatusCode::BAD_REQUEST) - .body(Body::from("cupcakes")) + .body("cupcakes".to_string()) .unwrap(), ); @@ -350,7 +353,7 @@ mod tests { mock.push( Response::builder() .status(StatusCode::BAD_REQUEST) - .body(Body::empty()) + .body(String::new()) .unwrap(), ); @@ -366,7 +369,7 @@ mod tests { mock.push( Response::builder() .status(StatusCode::BAD_GATEWAY) - .body(Body::empty()) + .body(String::new()) .unwrap(), ); @@ -377,7 +380,7 @@ mod tests { mock.push( Response::builder() .status(StatusCode::NO_CONTENT) - .body(Body::empty()) + .body(String::new()) .unwrap(), ); @@ -389,7 +392,7 @@ mod tests { Response::builder() .status(StatusCode::FOUND) .header(LOCATION, "/foo") - .body(Body::empty()) + .body(String::new()) .unwrap(), ); @@ -402,7 +405,7 @@ mod tests { Response::builder() .status(StatusCode::FOUND) .header(LOCATION, "/bar") - .body(Body::empty()) + .body(String::new()) .unwrap(), ); @@ -416,7 +419,7 @@ mod tests { Response::builder() .status(StatusCode::FOUND) .header(LOCATION, "/bar") - .body(Body::empty()) + .body(String::new()) .unwrap(), ); } @@ -428,7 +431,7 
@@ mod tests { mock.push( Response::builder() .status(StatusCode::FOUND) - .body(Body::empty()) + .body(String::new()) .unwrap(), ); @@ -441,7 +444,7 @@ mod tests { mock.push( Response::builder() .status(StatusCode::BAD_GATEWAY) - .body(Body::from("ignored")) + .body("ignored".to_string()) .unwrap(), ); } diff --git a/src/parse.rs b/src/parse.rs index 116c2ad..5549fd3 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -311,14 +311,14 @@ mod tests { #[cfg(feature = "http")] async fn test_url_http() { use crate::client::mock_server::MockServer; - use hyper::{header::USER_AGENT, Body, Response}; + use hyper::{header::USER_AGENT, Response}; - let server = MockServer::new(); + let server = MockServer::new().await; server.push_fn(|r| { assert_eq!(r.uri().path(), "/foo/bar"); assert_eq!(r.headers().get(USER_AGENT).unwrap(), "test_url"); - Response::new(Body::empty()) + Response::new(String::new()) }); let test = format!("{}/foo/bar", server.url()); From 46b55a22a5346e55db0c492c5abf4c01b3c8d1ec Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 29 Mar 2024 07:58:18 +0000 Subject: [PATCH 285/397] Update error message tests (#5569) --- src/aws/builder.rs | 5 +---- src/client/retry.rs | 4 ++-- src/gcp/builder.rs | 5 +---- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index a578d1a..664e183 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -1333,10 +1333,7 @@ mod tests { .unwrap_err() .to_string(); - assert_eq!( - "Generic HTTP client error: builder error: unknown proxy scheme", - err - ); + assert_eq!("Generic HTTP client error: builder error", err); } #[test] diff --git a/src/client/retry.rs b/src/client/retry.rs index c4f5298..e4bb5c9 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -425,7 +425,7 @@ mod tests { } let e = do_request().await.unwrap_err().to_string(); - assert!(e.ends_with("too many redirects"), "{}", e); + assert!(e.contains("error following redirect for url"), "{}", e); // Handles redirect missing location mock.push( @@ -489,7 +489,7 @@ mod tests { let res = client.request(Method::PUT, mock.url()).send_retry(&retry); let e = res.await.unwrap_err().to_string(); assert!( - e.contains("Error after 0 retries in") && e.contains("operation timed out"), + e.contains("Error after 0 retries in") && e.contains("error sending request for url"), "{e}" ); diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 14c4257..2cf7504 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -594,10 +594,7 @@ mod tests { .unwrap_err() .to_string(); - assert_eq!( - "Generic HTTP client error: builder error: unknown proxy scheme", - err - ); + assert_eq!("Generic HTTP client error: builder error", err); } #[test] From 4e3dcd54a78ed2db8ab11db28de53beb5f99ad2c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 29 Mar 2024 10:14:35 +0000 Subject: [PATCH 286/397] Implement MultipartStore for ThrottledStore (#5533) * Implement MultipartStore for ThrottledStore Limit concurrency in BufWriter Tweak WriteMultipart * Fix MSRV * Format --- src/buffered.rs | 14 +++++++++ src/throttle.rs | 78 ++++++++++++++++++++++++++++++++++++++++++++----- src/upload.rs | 76 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 148 insertions(+), 20 deletions(-) diff --git a/src/buffered.rs b/src/buffered.rs index 39f8eaf..de6d4eb 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -216,6 +216,7 @@ impl AsyncBufRead for 
BufReader { /// streamed using [`ObjectStore::put_multipart`] pub struct BufWriter { capacity: usize, + max_concurrency: usize, state: BufWriterState, store: Arc, } @@ -250,10 +251,21 @@ impl BufWriter { Self { capacity, store, + max_concurrency: 8, state: BufWriterState::Buffer(path, Vec::new()), } } + /// Override the maximum number of in-flight requests for this writer + /// + /// Defaults to 8 + pub fn with_max_concurrency(self, max_concurrency: usize) -> Self { + Self { + max_concurrency, + ..self + } + } + /// Abort this writer, cleaning up any partially uploaded state /// /// # Panic @@ -275,9 +287,11 @@ impl AsyncWrite for BufWriter { buf: &[u8], ) -> Poll> { let cap = self.capacity; + let max_concurrency = self.max_concurrency; loop { return match &mut self.state { BufWriterState::Write(Some(write)) => { + ready!(write.poll_for_capacity(cx, max_concurrency))?; write.write(buf); Poll::Ready(Ok(buf.len())) } diff --git a/src/throttle.rs b/src/throttle.rs index 5ca1eed..65fac59 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -20,11 +20,12 @@ use parking_lot::Mutex; use std::ops::Range; use std::{convert::TryInto, sync::Arc}; -use crate::GetOptions; +use crate::multipart::{MultipartStore, PartId}; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, - PutOptions, PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, MultipartId, MultipartUpload, ObjectMeta, + ObjectStore, PutOptions, PutResult, Result, }; +use crate::{GetOptions, UploadPart}; use async_trait::async_trait; use bytes::Bytes; use futures::{stream::BoxStream, FutureExt, StreamExt}; @@ -110,12 +111,12 @@ async fn sleep(duration: Duration) { /// **Note that the behavior of the wrapper is deterministic and might not reflect real-world /// conditions!** #[derive(Debug)] -pub struct ThrottledStore { +pub struct ThrottledStore { inner: T, config: Arc>, } -impl ThrottledStore { +impl ThrottledStore { /// Create new wrapper with zero waiting times. 
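The `with_max_concurrency` knob added to `BufWriter` above caps how many part uploads may be in flight at once. A hedged usage sketch, assuming the post-change `object_store` API together with `tokio`:

```rust
use std::sync::Arc;

use object_store::buffered::BufWriter;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::ObjectStore;
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
    let path = Path::from("data/large.bin");

    // Allow at most 4 in-flight part uploads while streaming data in
    let mut writer = BufWriter::new(Arc::clone(&store), path.clone()).with_max_concurrency(4);
    for _ in 0..3 {
        writer.write_all(&[0u8; 1024]).await?;
    }
    // shutdown flushes any buffered bytes and completes the upload
    writer.shutdown().await?;

    let meta = store.head(&path).await?;
    assert_eq!(meta.size, 3 * 1024);
    Ok(())
}
```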
pub fn new(inner: T, config: ThrottleConfig) -> Self { Self { @@ -157,8 +158,12 @@ impl ObjectStore for ThrottledStore { self.inner.put_opts(location, bytes, opts).await } - async fn put_multipart(&self, _location: &Path) -> Result> { - Err(super::Error::NotImplemented) + async fn put_multipart(&self, location: &Path) -> Result> { + let upload = self.inner.put_multipart(location).await?; + Ok(Box::new(ThrottledUpload { + upload, + sleep: self.config().wait_put_per_call, + })) } async fn get(&self, location: &Path) -> Result { @@ -316,6 +321,63 @@ where .boxed() } +#[async_trait] +impl MultipartStore for ThrottledStore { + async fn create_multipart(&self, path: &Path) -> Result { + self.inner.create_multipart(path).await + } + + async fn put_part( + &self, + path: &Path, + id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + sleep(self.config().wait_put_per_call).await; + self.inner.put_part(path, id, part_idx, data).await + } + + async fn complete_multipart( + &self, + path: &Path, + id: &MultipartId, + parts: Vec, + ) -> Result { + self.inner.complete_multipart(path, id, parts).await + } + + async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()> { + self.inner.abort_multipart(path, id).await + } +} + +#[derive(Debug)] +struct ThrottledUpload { + upload: Box, + sleep: Duration, +} + +#[async_trait] +impl MultipartUpload for ThrottledUpload { + fn put_part(&mut self, data: Bytes) -> UploadPart { + let duration = self.sleep; + let put = self.upload.put_part(data); + Box::pin(async move { + sleep(duration).await; + put.await + }) + } + + async fn complete(&mut self) -> Result { + self.upload.complete().await + } + + async fn abort(&mut self) -> Result<()> { + self.upload.abort().await + } +} + #[cfg(test)] mod tests { use super::*; @@ -351,6 +413,8 @@ mod tests { list_with_delimiter(&store).await; rename_and_copy(&store).await; copy_if_not_exists(&store).await; + stream_get(&store).await; + multipart(&store, &store).await; } #[tokio::test] diff --git a/src/upload.rs b/src/upload.rs index 6f8bfa8..fe864e2 100644 --- a/src/upload.rs +++ b/src/upload.rs @@ -15,12 +15,16 @@ // specific language governing permissions and limitations // under the License. 
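The hunk below reworks `WriteMultipart` so producers can apply back pressure by waiting for spare capacity. The core idea is simply draining a tokio `JoinSet` until it drops below a limit; a minimal sketch independent of the crate's types:

```rust
use std::time::Duration;

use tokio::task::JoinSet;

/// Wait until at most `max_concurrency` tasks remain in flight.
async fn wait_for_capacity(tasks: &mut JoinSet<()>, max_concurrency: usize) {
    while tasks.len() > max_concurrency {
        // join_next only returns None when the set is empty, which the loop guard rules out
        tasks.join_next().await.unwrap().unwrap();
    }
}

#[tokio::main]
async fn main() {
    let mut tasks: JoinSet<()> = JoinSet::new();
    for _ in 0..16 {
        // Apply back pressure before spawning the next "upload"
        wait_for_capacity(&mut tasks, 4).await;
        tasks.spawn(async {
            tokio::time::sleep(Duration::from_millis(5)).await;
        });
    }
    // Drain everything before "completing" the upload
    wait_for_capacity(&mut tasks, 0).await;
    assert!(tasks.is_empty());
}
```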
-use crate::{PutResult, Result}; +use std::task::{Context, Poll}; + use async_trait::async_trait; use bytes::Bytes; use futures::future::BoxFuture; +use futures::ready; use tokio::task::JoinSet; +use crate::{PutResult, Result}; + /// An upload part request pub type UploadPart = BoxFuture<'static, Result<()>>; @@ -110,31 +114,44 @@ pub struct WriteMultipart { impl WriteMultipart { /// Create a new [`WriteMultipart`] that will upload using 5MB chunks pub fn new(upload: Box) -> Self { - Self::new_with_capacity(upload, 5 * 1024 * 1024) + Self::new_with_chunk_size(upload, 5 * 1024 * 1024) } - /// Create a new [`WriteMultipart`] that will upload in fixed `capacity` sized chunks - pub fn new_with_capacity(upload: Box, capacity: usize) -> Self { + /// Create a new [`WriteMultipart`] that will upload in fixed `chunk_size` sized chunks + pub fn new_with_chunk_size(upload: Box, chunk_size: usize) -> Self { Self { upload, - buffer: Vec::with_capacity(capacity), + buffer: Vec::with_capacity(chunk_size), tasks: Default::default(), } } - /// Wait until there are `max_concurrency` or fewer requests in-flight - pub async fn wait_for_capacity(&mut self, max_concurrency: usize) -> Result<()> { - while self.tasks.len() > max_concurrency { - self.tasks.join_next().await.unwrap()??; + /// Polls for there to be less than `max_concurrency` [`UploadPart`] in progress + /// + /// See [`Self::wait_for_capacity`] for an async version of this function + pub fn poll_for_capacity( + &mut self, + cx: &mut Context<'_>, + max_concurrency: usize, + ) -> Poll> { + while !self.tasks.is_empty() && self.tasks.len() >= max_concurrency { + ready!(self.tasks.poll_join_next(cx)).unwrap()?? } - Ok(()) + Poll::Ready(Ok(())) + } + + /// Wait until there are less than `max_concurrency` [`UploadPart`] in progress + /// + /// See [`Self::poll_for_capacity`] for a [`Poll`] version of this function + pub async fn wait_for_capacity(&mut self, max_concurrency: usize) -> Result<()> { + futures::future::poll_fn(|cx| self.poll_for_capacity(cx, max_concurrency)).await } /// Write data to this [`WriteMultipart`] /// - /// Note this method is synchronous (not `async`) and will immediately start new uploads - /// as soon as the internal `capacity` is hit, regardless of - /// how many outstanding uploads are already in progress. + /// Note this method is synchronous (not `async`) and will immediately + /// start new uploads as soon as the internal `chunk_size` is hit, + /// regardless of how many outstanding uploads are already in progress. 
/// /// Back pressure can optionally be applied to producers by calling /// [`Self::wait_for_capacity`] prior to calling this method @@ -173,3 +190,36 @@ impl WriteMultipart { self.upload.complete().await } } + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use futures::FutureExt; + + use crate::memory::InMemory; + use crate::path::Path; + use crate::throttle::{ThrottleConfig, ThrottledStore}; + use crate::ObjectStore; + + use super::*; + + #[tokio::test] + async fn test_concurrency() { + let config = ThrottleConfig { + wait_put_per_call: Duration::from_millis(1), + ..Default::default() + }; + + let path = Path::from("foo"); + let store = ThrottledStore::new(InMemory::new(), config); + let upload = store.put_multipart(&path).await.unwrap(); + let mut write = WriteMultipart::new_with_chunk_size(upload, 10); + + for _ in 0..20 { + write.write(&[0; 5]); + } + assert!(write.wait_for_capacity(10).now_or_never().is_none()); + write.wait_for_capacity(10).await.unwrap() + } +} From 7e3a6cdc8ae766df658f5b81878515f25d1c3e45 Mon Sep 17 00:00:00 2001 From: Yu Zeng Date: Tue, 2 Apr 2024 03:23:18 +0800 Subject: [PATCH 287/397] using latest choron (#5578) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 79813a0..d0c3af2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ all-features = true [dependencies] # In alphabetical order async-trait = "0.1.53" bytes = "1.0" -chrono = { version = "0.4.31", default-features = false, features = ["clock"] } +chrono = { version = "0.4.34", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" itertools = "0.12.0" From fb10fd08ddd180d02eebd331a9365fa8647f1de3 Mon Sep 17 00:00:00 2001 From: Yu Zeng Date: Thu, 4 Apr 2024 23:23:18 +0800 Subject: [PATCH 288/397] Add GCS signed URL support (#5300) * add util function for gcp sign url * add string to sign and other sign functions * add GoogleCloudStorageConfig::new and config and move functions to client * add more code and rearrange struct * add client_email for credential and return the signed url * clean some code * add client email for AuthorizedUserCredentials * tidy some code * format doc * Add GcpSigningCredentialProvider for getting email * add test * Move some functions which shared by aws and gcp to utils. * fix some bug and make it can get proper result * remoe useless code * tidy some code * do not export host * add sign_by_key * Cleanup * Add ServiceAccountKey * Further tweaks * add more scope for signing. 
* tidy * Tweak and add test * Retry and handle errors for signBlob --------- Co-authored-by: Raphael Taylor-Davies --- src/aws/credential.rs | 19 +- src/aws/mod.rs | 11 +- src/gcp/builder.rs | 55 ++++- src/gcp/client.rs | 105 +++++++++- src/gcp/credential.rs | 465 ++++++++++++++++++++++++++++++++++++++---- src/gcp/mod.rs | 74 ++++++- src/util.rs | 29 +++ 7 files changed, 677 insertions(+), 81 deletions(-) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index dd7fa5b..478e56d 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -19,7 +19,7 @@ use crate::aws::{AwsCredentialProvider, STORE, STRICT_ENCODE_SET, STRICT_PATH_EN use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::client::TokenProvider; -use crate::util::hmac_sha256; +use crate::util::{hex_digest, hex_encode, hmac_sha256}; use crate::{CredentialProvider, Result, RetryConfig}; use async_trait::async_trait; use bytes::Buf; @@ -342,23 +342,6 @@ impl CredentialExt for RequestBuilder { } } -/// Computes the SHA256 digest of `body` returned as a hex encoded string -fn hex_digest(bytes: &[u8]) -> String { - let digest = ring::digest::digest(&ring::digest::SHA256, bytes); - hex_encode(digest.as_ref()) -} - -/// Returns `bytes` as a lower-case hex encoded string -fn hex_encode(bytes: &[u8]) -> String { - use std::fmt::Write; - let mut out = String::with_capacity(bytes.len() * 2); - for byte in bytes { - // String writing is infallible - let _ = write!(out, "{byte:02x}"); - } - out -} - /// Canonicalizes query parameters into the AWS canonical form /// /// diff --git a/src/aws/mod.rs b/src/aws/mod.rs index b33771d..76d01d5 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -43,6 +43,7 @@ use crate::client::list::ListClientExt; use crate::client::CredentialProvider; use crate::multipart::{MultipartStore, PartId}; use crate::signer::Signer; +use crate::util::STRICT_ENCODE_SET; use crate::{ Error, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, ObjectMeta, ObjectStore, Path, PutMode, PutOptions, PutResult, Result, UploadPart, @@ -64,16 +65,6 @@ pub use dynamo::DynamoCommit; pub use precondition::{S3ConditionalPut, S3CopyIfNotExists}; pub use resolve::resolve_bucket_region; -// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html -// -// Do not URI-encode any of the unreserved characters that RFC 3986 defines: -// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). 
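The `hex_digest`/`hex_encode` helpers being moved into `util` above are small enough to restate; a standalone equivalent of the hex-encoding step (the digest variant additionally runs the input through `ring`'s SHA-256 first):

```rust
use std::fmt::Write;

/// Lower-case hex encode a byte slice, two characters per byte.
fn hex_encode(bytes: &[u8]) -> String {
    let mut out = String::with_capacity(bytes.len() * 2);
    for byte in bytes {
        // Writing into a String cannot fail
        let _ = write!(out, "{byte:02x}");
    }
    out
}

fn main() {
    assert_eq!(hex_encode(b"\x00\xffarrow"), "00ff6172726f77");
    println!("{}", hex_encode(b"object_store"));
}
```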
-pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = percent_encoding::NON_ALPHANUMERIC - .remove(b'-') - .remove(b'.') - .remove(b'_') - .remove(b'~'); - /// This struct is used to maintain the URI path encoding const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 2cf7504..4fa9167 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -21,7 +21,10 @@ use crate::gcp::credential::{ ApplicationDefaultCredentials, InstanceCredentialProvider, ServiceAccountCredentials, DEFAULT_GCS_BASE_URL, }; -use crate::gcp::{credential, GcpCredential, GcpCredentialProvider, GoogleCloudStorage, STORE}; +use crate::gcp::{ + credential, GcpCredential, GcpCredentialProvider, GcpSigningCredential, + GcpSigningCredentialProvider, GoogleCloudStorage, STORE, +}; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -29,6 +32,8 @@ use std::str::FromStr; use std::sync::Arc; use url::Url; +use super::credential::{AuthorizedUserSigningCredentials, InstanceSigningCredentialProvider}; + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Missing bucket name"))] @@ -107,6 +112,8 @@ pub struct GoogleCloudStorageBuilder { client_options: ClientOptions, /// Credentials credentials: Option, + /// Credentials for sign url + signing_cedentials: Option, } /// Configuration keys for [`GoogleCloudStorageBuilder`] @@ -202,6 +209,7 @@ impl Default for GoogleCloudStorageBuilder { client_options: ClientOptions::new().with_allow_http(true), url: None, credentials: None, + signing_cedentials: None, } } } @@ -452,13 +460,13 @@ impl GoogleCloudStorageBuilder { Arc::new(StaticCredentialProvider::new(GcpCredential { bearer: "".to_string(), })) as _ - } else if let Some(credentials) = service_account_credentials { + } else if let Some(credentials) = service_account_credentials.clone() { Arc::new(TokenCredentialProvider::new( credentials.token_provider()?, self.client_options.client()?, self.retry_config.clone(), )) as _ - } else if let Some(credentials) = application_default_credentials { + } else if let Some(credentials) = application_default_credentials.clone() { match credentials { ApplicationDefaultCredentials::AuthorizedUser(token) => { Arc::new(TokenCredentialProvider::new( @@ -483,13 +491,44 @@ impl GoogleCloudStorageBuilder { )) as _ }; - let config = GoogleCloudStorageConfig { - base_url: gcs_base_url, + let signing_credentials = if let Some(signing_credentials) = self.signing_cedentials { + signing_credentials + } else if disable_oauth { + Arc::new(StaticCredentialProvider::new(GcpSigningCredential { + email: "".to_string(), + private_key: None, + })) as _ + } else if let Some(credentials) = service_account_credentials.clone() { + credentials.signing_credentials()? + } else if let Some(credentials) = application_default_credentials.clone() { + match credentials { + ApplicationDefaultCredentials::AuthorizedUser(token) => { + Arc::new(TokenCredentialProvider::new( + AuthorizedUserSigningCredentials::from(token)?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + ApplicationDefaultCredentials::ServiceAccount(token) => { + token.signing_credentials()? 
+ } + } + } else { + Arc::new(TokenCredentialProvider::new( + InstanceSigningCredentialProvider::default(), + self.client_options.metadata_client()?, + self.retry_config.clone(), + )) as _ + }; + + let config = GoogleCloudStorageConfig::new( + gcs_base_url, credentials, + signing_credentials, bucket_name, - retry_config: self.retry_config, - client_options: self.client_options, - }; + self.retry_config, + self.client_options, + ); Ok(GoogleCloudStorage { client: Arc::new(GoogleCloudStorageClient::new(config)?), diff --git a/src/gcp/client.rs b/src/gcp/client.rs index def53be..901257f 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -24,19 +24,22 @@ use crate::client::s3::{ ListResponse, }; use crate::client::GetOptionsExt; -use crate::gcp::{GcpCredential, GcpCredentialProvider, STORE}; +use crate::gcp::{GcpCredential, GcpCredentialProvider, GcpSigningCredentialProvider, STORE}; use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; +use crate::util::hex_encode; use crate::{ ClientOptions, GetOptions, ListResult, MultipartId, PutMode, PutOptions, PutResult, Result, RetryConfig, }; use async_trait::async_trait; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::{Buf, Bytes}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::header::HeaderName; use reqwest::{header, Client, Method, RequestBuilder, Response, StatusCode}; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::sync::Arc; @@ -101,6 +104,15 @@ enum Error { #[snafu(display("Got invalid multipart response: {}", source))] InvalidMultipartResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Error signing blob: {}", source))] + SignBlobRequest { source: crate::client::retry::Error }, + + #[snafu(display("Got invalid signing blob repsonse: {}", source))] + InvalidSignBlobResponse { source: reqwest::Error }, + + #[snafu(display("Got invalid signing blob signature: {}", source))] + InvalidSignBlobSignature { source: base64::DecodeError }, } impl From for crate::Error { @@ -123,6 +135,8 @@ pub struct GoogleCloudStorageConfig { pub credentials: GcpCredentialProvider, + pub signing_credentials: GcpSigningCredentialProvider, + pub bucket_name: String, pub retry_config: RetryConfig, @@ -130,6 +144,30 @@ pub struct GoogleCloudStorageConfig { pub client_options: ClientOptions, } +impl GoogleCloudStorageConfig { + pub fn new( + base_url: String, + credentials: GcpCredentialProvider, + signing_credentials: GcpSigningCredentialProvider, + bucket_name: String, + retry_config: RetryConfig, + client_options: ClientOptions, + ) -> Self { + Self { + base_url, + credentials, + signing_credentials, + bucket_name, + retry_config, + client_options, + } + } + + pub fn path_url(&self, path: &Path) -> String { + format!("{}/{}/{}", self.base_url, self.bucket_name, path) + } +} + /// A builder for a put request allowing customisation of the headers and query string pub struct PutRequest<'a> { path: &'a Path, @@ -163,6 +201,21 @@ impl<'a> PutRequest<'a> { } } +/// Sign Blob Request Body +#[derive(Debug, Serialize)] +struct SignBlobBody { + /// The payload to sign + payload: String, +} + +/// Sign Blob Response +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +struct SignBlobResponse { + /// The signature for the payload + signed_blob: String, +} + #[derive(Debug)] pub struct GoogleCloudStorageClient { config: GoogleCloudStorageConfig, @@ -197,6 +250,54 @@ impl GoogleCloudStorageClient { 
self.config.credentials.get_credential().await } + /// Create a signature from a string-to-sign using Google Cloud signBlob method. + /// form like: + /// ```plaintext + /// curl -X POST --data-binary @JSON_FILE_NAME \ + /// -H "Authorization: Bearer OAUTH2_TOKEN" \ + /// -H "Content-Type: application/json" \ + /// "https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/SERVICE_ACCOUNT_EMAIL:signBlob" + /// ``` + /// + /// 'JSON_FILE_NAME' is a file containing the following JSON object: + /// ```plaintext + /// { + /// "payload": "REQUEST_INFORMATION" + /// } + /// ``` + pub async fn sign_blob(&self, string_to_sign: &str, client_email: &str) -> Result { + let credential = self.get_credential().await?; + let body = SignBlobBody { + payload: BASE64_STANDARD.encode(string_to_sign), + }; + + let url = format!( + "https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/{}:signBlob", + client_email + ); + + let response = self + .client + .post(&url) + .bearer_auth(&credential.bearer) + .json(&body) + .send_retry(&self.config.retry_config) + .await + .context(SignBlobRequestSnafu)?; + + //If successful, the signature is returned in the signedBlob field in the response. + let response = response + .json::() + .await + .context(InvalidSignBlobResponseSnafu)?; + + let signed_blob = BASE64_STANDARD + .decode(response.signed_blob) + .context(InvalidSignBlobSignatureSnafu)?; + + Ok(hex_encode(&signed_blob)) + } + pub fn object_url(&self, path: &Path) -> String { let encoded = utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); format!( diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 34cd6ee..fcd516a 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -15,19 +15,26 @@ // specific language governing permissions and limitations // under the License. 
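(Editor's aside, not part of the patch: `sign_blob` above base64-encodes the string to sign into the request payload, POSTs it to the IAM credentials endpoint, then base64-decodes the returned `signedBlob` and hex-encodes it for use as the `X-Goog-Signature` query parameter. A minimal sketch of just that encode/decode plumbing, using the same `base64` API the patch imports; the signature bytes here are a stand-in, not real signBlob output:)

```rust
use base64::prelude::BASE64_STANDARD;
use base64::Engine;

fn main() {
    // The string to sign becomes the base64 "payload" field of the request body.
    let string_to_sign = "GOOG4-RSA-SHA256\n20240101T000000Z\n...";
    let payload = BASE64_STANDARD.encode(string_to_sign);
    println!("request body: {{\"payload\": \"{payload}\"}}");

    // The base64 `signedBlob` in the response is decoded, then hex-encoded.
    let signed_blob = BASE64_STANDARD.encode([0xde_u8, 0xad, 0xbe, 0xef]);
    let signature: String = BASE64_STANDARD
        .decode(signed_blob)
        .unwrap()
        .iter()
        .map(|b| format!("{b:02x}"))
        .collect();
    assert_eq!(signature, "deadbeef");
}
```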
+use super::client::GoogleCloudStorageClient; use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; use crate::client::TokenProvider; -use crate::gcp::STORE; -use crate::RetryConfig; +use crate::gcp::{GcpSigningCredentialProvider, STORE}; +use crate::util::{hex_digest, hex_encode, STRICT_ENCODE_SET}; +use crate::{RetryConfig, StaticCredentialProvider}; use async_trait::async_trait; use base64::prelude::BASE64_URL_SAFE_NO_PAD; use base64::Engine; +use chrono::{DateTime, Utc}; use futures::TryFutureExt; +use hyper::HeaderMap; +use itertools::Itertools; +use percent_encoding::utf8_percent_encode; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use serde::Deserialize; use snafu::{ResultExt, Snafu}; +use std::collections::BTreeMap; use std::env; use std::fs::File; use std::io::BufReader; @@ -35,11 +42,15 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::info; +use url::Url; -pub const DEFAULT_SCOPE: &str = "https://www.googleapis.com/auth/devstorage.full_control"; +pub const DEFAULT_SCOPE: &str = "https://www.googleapis.com/auth/cloud-platform"; pub const DEFAULT_GCS_BASE_URL: &str = "https://storage.googleapis.com"; +const DEFAULT_GCS_PLAYLOAD_STRING: &str = "UNSIGNED-PAYLOAD"; +const DEFAULT_GCS_SIGN_BLOB_HOST: &str = "storage.googleapis.com"; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] @@ -57,7 +68,7 @@ pub enum Error { #[snafu(display("Invalid RSA key: {}", source), context(false))] InvalidKey { source: ring::error::KeyRejected }, - #[snafu(display("Error signing jwt: {}", source))] + #[snafu(display("Error signing: {}", source))] Sign { source: ring::error::Unspecified }, #[snafu(display("Error encoding jwt payload: {}", source))] @@ -82,6 +93,69 @@ impl From for crate::Error { } } +/// A Google Cloud Storage Credential for signing +#[derive(Debug)] +pub struct GcpSigningCredential { + /// The email of the service account + pub email: String, + + /// An optional RSA private key + /// + /// If provided this will be used to sign the URL, otherwise a call will be made to + /// [`iam.serviceAccounts.signBlob`]. This allows supporting credential sources + /// that don't expose the service account private key, e.g. [IMDS]. + /// + /// [IMDS]: https://cloud.google.com/docs/authentication/get-id-token#metadata-server + /// [`iam.serviceAccounts.signBlob`]: https://cloud.google.com/storage/docs/authentication/creating-signatures + pub private_key: Option, +} + +/// A private RSA key for a service account +#[derive(Debug)] +pub struct ServiceAccountKey(RsaKeyPair); + +impl ServiceAccountKey { + /// Parses a pem-encoded RSA key + pub fn from_pem(encoded: &[u8]) -> Result { + use rustls_pemfile::Item; + use std::io::Cursor; + + let mut cursor = Cursor::new(encoded); + let mut reader = BufReader::new(&mut cursor); + + // Reading from string is infallible + match rustls_pemfile::read_one(&mut reader).unwrap() { + Some(Item::Pkcs8Key(key)) => Self::from_pkcs8(key.secret_pkcs8_der()), + Some(Item::Pkcs1Key(key)) => Self::from_der(key.secret_pkcs1_der()), + _ => Err(Error::MissingKey), + } + } + + /// Parses an unencrypted PKCS#8-encoded RSA private key. + pub fn from_pkcs8(key: &[u8]) -> Result { + Ok(Self(RsaKeyPair::from_pkcs8(key)?)) + } + + /// Parses an unencrypted PKCS#8-encoded RSA private key. 
+ pub fn from_der(key: &[u8]) -> Result { + Ok(Self(RsaKeyPair::from_der(key)?)) + } + + fn sign(&self, string_to_sign: &str) -> Result { + let mut signature = vec![0; self.0.public().modulus_len()]; + self.0 + .sign( + &ring::signature::RSA_PKCS1_SHA256, + &ring::rand::SystemRandom::new(), + string_to_sign.as_bytes(), + &mut signature, + ) + .context(SignSnafu)?; + + Ok(hex_encode(&signature)) + } +} + /// A Google Cloud Storage Credential #[derive(Debug, Eq, PartialEq)] pub struct GcpCredential { @@ -152,9 +226,8 @@ struct TokenResponse { pub struct SelfSignedJwt { issuer: String, scope: String, - key_pair: RsaKeyPair, - jwt_header: String, - random: ring::rand::SystemRandom, + private_key: ServiceAccountKey, + key_id: String, } impl SelfSignedJwt { @@ -162,23 +235,14 @@ impl SelfSignedJwt { pub fn new( key_id: String, issuer: String, - private_key_pem: String, + private_key: ServiceAccountKey, scope: String, ) -> Result { - let key_pair = decode_first_rsa_key(private_key_pem)?; - let jwt_header = b64_encode_obj(&JwtHeader { - alg: "RS256", - typ: Some("JWT"), - kid: Some(&key_id), - ..Default::default() - })?; - Ok(Self { issuer, - key_pair, scope, - jwt_header, - random: ring::rand::SystemRandom::new(), + private_key, + key_id, }) } } @@ -204,13 +268,21 @@ impl TokenProvider for SelfSignedJwt { exp, }; + let jwt_header = b64_encode_obj(&JwtHeader { + alg: "RS256", + typ: Some("JWT"), + kid: Some(&self.key_id), + ..Default::default() + })?; + let claim_str = b64_encode_obj(&claims)?; - let message = [self.jwt_header.as_ref(), claim_str.as_ref()].join("."); - let mut sig_bytes = vec![0; self.key_pair.public().modulus_len()]; - self.key_pair + let message = [jwt_header.as_ref(), claim_str.as_ref()].join("."); + let mut sig_bytes = vec![0; self.private_key.0.public().modulus_len()]; + self.private_key + .0 .sign( &ring::signature::RSA_PKCS1_SHA256, - &self.random, + &ring::rand::SystemRandom::new(), message.as_bytes(), &mut sig_bytes, ) @@ -238,7 +310,7 @@ where } /// A deserialized `service-account-********.json`-file. -#[derive(serde::Deserialize, Debug)] +#[derive(serde::Deserialize, Debug, Clone)] pub struct ServiceAccountCredentials { /// The private key in RSA format. pub private_key: String, @@ -281,10 +353,19 @@ impl ServiceAccountCredentials { Ok(SelfSignedJwt::new( self.private_key_id, self.client_email, - self.private_key, + ServiceAccountKey::from_pem(self.private_key.as_bytes())?, DEFAULT_SCOPE.to_string(), )?) 
} + + pub fn signing_credentials(self) -> crate::Result { + Ok(Arc::new(StaticCredentialProvider::new( + GcpSigningCredential { + email: self.client_email, + private_key: Some(ServiceAccountKey::from_pem(self.private_key.as_bytes())?), + }, + ))) + } } /// Returns the number of seconds since unix epoch @@ -295,21 +376,6 @@ fn seconds_since_epoch() -> u64 { .as_secs() } -fn decode_first_rsa_key(private_key_pem: String) -> Result { - use rustls_pemfile::Item; - use std::io::Cursor; - - let mut cursor = Cursor::new(private_key_pem); - let mut reader = BufReader::new(&mut cursor); - - // Reading from string is infallible - match rustls_pemfile::read_one(&mut reader).unwrap() { - Some(Item::Pkcs8Key(key)) => Ok(RsaKeyPair::from_pkcs8(key.secret_pkcs8_der())?), - Some(Item::Pkcs1Key(key)) => Ok(RsaKeyPair::from_der(key.secret_pkcs1_der())?), - _ => Err(Error::MissingKey), - } -} - fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; Ok(BASE64_URL_SAFE_NO_PAD.encode(string)) @@ -360,6 +426,7 @@ impl TokenProvider for InstanceCredentialProvider { let response = make_metadata_request(client, METADATA_HOST, retry) .or_else(|_| make_metadata_request(client, METADATA_IP, retry)) .await?; + let token = TemporaryToken { token: Arc::new(GcpCredential { bearer: response.access_token, @@ -370,12 +437,69 @@ impl TokenProvider for InstanceCredentialProvider { } } +/// Make a request to the metadata server to fetch the client email, using a given hostname. +async fn make_metadata_request_for_email( + client: &Client, + hostname: &str, + retry: &RetryConfig, +) -> crate::Result { + let url = + format!("http://{hostname}/computeMetadata/v1/instance/service-accounts/default/email",); + let response = client + .request(Method::GET, url) + .header("Metadata-Flavor", "Google") + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .text() + .await + .context(TokenResponseBodySnafu)?; + Ok(response) +} + +/// A provider that uses the Google Cloud Platform metadata server to fetch a email for signing. +/// +/// +#[derive(Debug, Default)] +pub struct InstanceSigningCredentialProvider {} + +#[async_trait] +impl TokenProvider for InstanceSigningCredentialProvider { + type Credential = GcpSigningCredential; + + /// Fetch a token from the metadata server. + /// Since the connection is local we need to enable http access and don't actually use the client object passed in. + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> crate::Result>> { + const METADATA_IP: &str = "169.254.169.254"; + const METADATA_HOST: &str = "metadata"; + + info!("fetching token from metadata server"); + + let email = make_metadata_request_for_email(client, METADATA_HOST, retry) + .or_else(|_| make_metadata_request_for_email(client, METADATA_IP, retry)) + .await?; + + let token = TemporaryToken { + token: Arc::new(GcpSigningCredential { + email, + private_key: None, + }), + expiry: None, + }; + Ok(token) + } +} + /// A deserialized `application_default_credentials.json`-file. /// /// # References /// - /// - -#[derive(serde::Deserialize)] +#[derive(serde::Deserialize, Clone)] #[serde(tag = "type")] pub enum ApplicationDefaultCredentials { /// Service Account. 
@@ -423,13 +547,65 @@ impl ApplicationDefaultCredentials { const DEFAULT_TOKEN_GCP_URI: &str = "https://accounts.google.com/o/oauth2/token"; /// -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct AuthorizedUserCredentials { client_id: String, client_secret: String, refresh_token: String, } +#[derive(Debug, Deserialize)] +pub struct AuthorizedUserSigningCredentials { + credential: AuthorizedUserCredentials, +} + +/// +#[derive(Debug, Deserialize)] +struct EmailResponse { + email: String, +} + +impl AuthorizedUserSigningCredentials { + pub fn from(credential: AuthorizedUserCredentials) -> crate::Result { + Ok(Self { credential }) + } + + async fn client_email(&self, client: &Client, retry: &RetryConfig) -> crate::Result { + let response = client + .request(Method::GET, "https://oauth2.googleapis.com/tokeninfo") + .query(&[("access_token", &self.credential.refresh_token)]) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .json::() + .await + .context(TokenResponseBodySnafu)?; + + Ok(response.email) + } +} + +#[async_trait] +impl TokenProvider for AuthorizedUserSigningCredentials { + type Credential = GcpSigningCredential; + + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> crate::Result>> { + let email = self.client_email(client, retry).await?; + + Ok(TemporaryToken { + token: Arc::new(GcpSigningCredential { + email, + private_key: None, + }), + expiry: None, + }) + } +} + #[async_trait] impl TokenProvider for AuthorizedUserCredentials { type Credential = GcpCredential; @@ -462,3 +638,208 @@ impl TokenProvider for AuthorizedUserCredentials { }) } } + +/// Trim whitespace from header values +fn trim_header_value(value: &str) -> String { + let mut ret = value.to_string(); + ret.retain(|c| !c.is_whitespace()); + ret +} + +/// A Google Cloud Storage Authorizer for generating signed URL using [Google SigV4] +/// +/// [Google SigV4]: https://cloud.google.com/storage/docs/access-control/signed-urls +#[derive(Debug)] +pub struct GCSAuthorizer { + date: Option>, + credential: Arc, +} + +impl GCSAuthorizer { + /// Create a new [`GCSAuthorizer`] + pub fn new(credential: Arc) -> Self { + Self { + date: None, + credential, + } + } + + pub(crate) async fn sign( + &self, + method: Method, + url: &mut Url, + expires_in: Duration, + client: &GoogleCloudStorageClient, + ) -> crate::Result<()> { + let email = &self.credential.email; + let date = self.date.unwrap_or_else(Utc::now); + let scope = self.scope(date); + let credential_with_scope = format!("{}/{}", email, scope); + + let mut headers = HeaderMap::new(); + headers.insert("host", DEFAULT_GCS_SIGN_BLOB_HOST.parse().unwrap()); + + let (_, signed_headers) = Self::canonicalize_headers(&headers); + + url.query_pairs_mut() + .append_pair("X-Goog-Algorithm", "GOOG4-RSA-SHA256") + .append_pair("X-Goog-Credential", &credential_with_scope) + .append_pair("X-Goog-Date", &date.format("%Y%m%dT%H%M%SZ").to_string()) + .append_pair("X-Goog-Expires", &expires_in.as_secs().to_string()) + .append_pair("X-Goog-SignedHeaders", &signed_headers); + + let string_to_sign = self.string_to_sign(date, &method, url, &headers); + let signature = match &self.credential.private_key { + Some(key) => key.sign(&string_to_sign)?, + None => client.sign_blob(&string_to_sign, email).await?, + }; + + url.query_pairs_mut() + .append_pair("X-Goog-Signature", &signature); + Ok(()) + } + + /// Get scope for the request + /// + /// + fn scope(&self, date: DateTime) -> String { + format!("{}/auto/storage/goog4_request", 
date.format("%Y%m%d"),) + } + + /// Canonicalizes query parameters into the GCP canonical form + /// form like: + ///```plaintext + ///HTTP_VERB + ///PATH_TO_RESOURCE + ///CANONICAL_QUERY_STRING + ///CANONICAL_HEADERS + /// + ///SIGNED_HEADERS + ///PAYLOAD + ///``` + /// + /// + fn canonicalize_request(url: &Url, methond: &Method, headers: &HeaderMap) -> String { + let verb = methond.as_str(); + let path = url.path(); + let query = Self::canonicalize_query(url); + let (canaonical_headers, signed_headers) = Self::canonicalize_headers(headers); + + format!( + "{}\n{}\n{}\n{}\n\n{}\n{}", + verb, path, query, canaonical_headers, signed_headers, DEFAULT_GCS_PLAYLOAD_STRING + ) + } + + /// Canonicalizes query parameters into the GCP canonical form + /// form like `max-keys=2&prefix=object` + /// + /// + fn canonicalize_query(url: &Url) -> String { + url.query_pairs() + .sorted_unstable_by(|a, b| a.0.cmp(&b.0)) + .map(|(k, v)| { + format!( + "{}={}", + utf8_percent_encode(k.as_ref(), &STRICT_ENCODE_SET), + utf8_percent_encode(v.as_ref(), &STRICT_ENCODE_SET) + ) + }) + .join("&") + } + + /// Canonicalizes header into the GCP canonical form + /// + /// + fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { + //FIXME add error handling for invalid header values + let mut headers = BTreeMap::>::new(); + for (k, v) in header_map { + headers + .entry(k.as_str().to_lowercase()) + .or_default() + .push(std::str::from_utf8(v.as_bytes()).unwrap()); + } + + let canonicalize_headers = headers + .iter() + .map(|(k, v)| { + format!( + "{}:{}", + k.trim(), + v.iter().map(|v| trim_header_value(v)).join(",") + ) + }) + .join("\n"); + + let signed_headers = headers.keys().join(";"); + + (canonicalize_headers, signed_headers) + } + + ///construct the string to sign + ///form like: + ///```plaintext + ///SIGNING_ALGORITHM + ///ACTIVE_DATETIME + ///CREDENTIAL_SCOPE + ///HASHED_CANONICAL_REQUEST + ///``` + ///`ACTIVE_DATETIME` format:`YYYYMMDD'T'HHMMSS'Z'` + /// + pub fn string_to_sign( + &self, + date: DateTime, + request_method: &Method, + url: &Url, + headers: &HeaderMap, + ) -> String { + let caninical_request = Self::canonicalize_request(url, request_method, headers); + let hashed_canonical_req = hex_digest(caninical_request.as_bytes()); + let scope = self.scope(date); + + format!( + "{}\n{}\n{}\n{}", + "GOOG4-RSA-SHA256", + date.format("%Y%m%dT%H%M%SZ"), + scope, + hashed_canonical_req + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_canonicalize_headers() { + let mut input_header = HeaderMap::new(); + input_header.insert("content-type", "text/plain".parse().unwrap()); + input_header.insert("host", "storage.googleapis.com".parse().unwrap()); + input_header.insert("x-goog-meta-reviewer", "jane".parse().unwrap()); + input_header.append("x-goog-meta-reviewer", "john".parse().unwrap()); + assert_eq!( + GCSAuthorizer::canonicalize_headers(&input_header), + ( + "content-type:text/plain +host:storage.googleapis.com +x-goog-meta-reviewer:jane,john" + .into(), + "content-type;host;x-goog-meta-reviewer".to_string() + ) + ); + } + + #[test] + fn test_canonicalize_query() { + let mut url = Url::parse("https://storage.googleapis.com/bucket/object").unwrap(); + url.query_pairs_mut() + .append_pair("max-keys", "2") + .append_pair("prefix", "object"); + assert_eq!( + GCSAuthorizer::canonicalize_query(&url), + "max-keys=2&prefix=object".to_string() + ); + } +} diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 2058d1f..96afa45 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -35,8 
+35,11 @@ //! //! [lifecycle rule]: https://cloud.google.com/storage/docs/lifecycle#abort-mpu use std::sync::Arc; +use std::time::Duration; use crate::client::CredentialProvider; +use crate::gcp::credential::GCSAuthorizer; +use crate::signer::Signer; use crate::{ multipart::PartId, path::Path, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, ObjectMeta, ObjectStore, PutOptions, PutResult, Result, UploadPart, @@ -45,13 +48,15 @@ use async_trait::async_trait; use bytes::Bytes; use client::GoogleCloudStorageClient; use futures::stream::BoxStream; +use hyper::Method; +use url::Url; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::parts::Parts; use crate::multipart::MultipartStore; pub use builder::{GoogleCloudStorageBuilder, GoogleConfigKey}; -pub use credential::GcpCredential; +pub use credential::{GcpCredential, GcpSigningCredential, ServiceAccountKey}; mod builder; mod client; @@ -62,6 +67,10 @@ const STORE: &str = "GCS"; /// [`CredentialProvider`] for [`GoogleCloudStorage`] pub type GcpCredentialProvider = Arc>; +/// [`GcpSigningCredential`] for [`GoogleCloudStorage`] +pub type GcpSigningCredentialProvider = + Arc>; + /// Interface for [Google Cloud Storage](https://cloud.google.com/storage/). #[derive(Debug)] pub struct GoogleCloudStorage { @@ -83,6 +92,11 @@ impl GoogleCloudStorage { pub fn credentials(&self) -> &GcpCredentialProvider { &self.client.config().credentials } + + /// Returns the [`GcpSigningCredentialProvider`] used by [`GoogleCloudStorage`] + pub fn signing_credentials(&self) -> &GcpSigningCredentialProvider { + &self.client.config().signing_credentials + } } #[derive(Debug)] @@ -215,6 +229,34 @@ impl MultipartStore for GoogleCloudStorage { } } +#[async_trait] +impl Signer for GoogleCloudStorage { + async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result { + if expires_in.as_secs() > 604800 { + return Err(crate::Error::Generic { + store: STORE, + source: "Expiration Time can't be longer than 604800 seconds (7 days).".into(), + }); + } + + let config = self.client.config(); + let path_url = config.path_url(path); + let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { + store: STORE, + source: format!("Unable to parse url {path_url}: {e}").into(), + })?; + + let signing_credentials = self.signing_credentials().get_credential().await?; + let authorizer = GCSAuthorizer::new(signing_credentials); + + authorizer + .sign(method, &mut url, expires_in, &self.client) + .await?; + + Ok(url) + } +} + #[cfg(test)] mod test { @@ -250,6 +292,36 @@ mod test { } } + #[tokio::test] + #[ignore] + async fn gcs_test_sign() { + crate::test_util::maybe_skip_integration!(); + let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); + + let client = reqwest::Client::new(); + + let path = Path::from("test_sign"); + let url = integration + .signed_url(Method::PUT, &path, Duration::from_secs(3600)) + .await + .unwrap(); + println!("PUT {url}"); + + let resp = client.put(url).body("data").send().await.unwrap(); + resp.error_for_status().unwrap(); + + let url = integration + .signed_url(Method::GET, &path, Duration::from_secs(3600)) + .await + .unwrap(); + println!("GET {url}"); + + let resp = client.get(url).send().await.unwrap(); + let resp = resp.error_for_status().unwrap(); + let data = resp.bytes().await.unwrap(); + assert_eq!(data.as_ref(), b"data"); + } + #[tokio::test] async fn gcs_test_get_nonexistent_location() { crate::test_util::maybe_skip_integration!(); diff 
--git a/src/util.rs b/src/util.rs index a19d5aa..161d2d1 100644 --- a/src/util.rs +++ b/src/util.rs @@ -285,6 +285,35 @@ impl> From for GetRange { } } } +// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html +// +// Do not URI-encode any of the unreserved characters that RFC 3986 defines: +// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). +#[cfg(any(feature = "aws", feature = "gcp"))] +pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = percent_encoding::NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); + +/// Computes the SHA256 digest of `body` returned as a hex encoded string +#[cfg(any(feature = "aws", feature = "gcp"))] +pub(crate) fn hex_digest(bytes: &[u8]) -> String { + let digest = ring::digest::digest(&ring::digest::SHA256, bytes); + hex_encode(digest.as_ref()) +} + +/// Returns `bytes` as a lower-case hex encoded string +#[cfg(any(feature = "aws", feature = "gcp"))] +pub(crate) fn hex_encode(bytes: &[u8]) -> String { + use std::fmt::Write; + let mut out = String::with_capacity(bytes.len() * 2); + for byte in bytes { + // String writing is infallible + let _ = write!(out, "{byte:02x}"); + } + out +} #[cfg(test)] mod tests { From 1f4731b6dd4fa84855fa1c1c9465a7c172721104 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 6 Apr 2024 07:07:50 +0100 Subject: [PATCH 289/397] Fix handling of empty multipart uploads for GCS (#5590) * Fix handling of empty multipart uploads for GCS * Clippy --- src/gcp/client.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 901257f..3762915 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -430,6 +430,13 @@ impl GoogleCloudStorageClient { multipart_id: &MultipartId, completed_parts: Vec, ) -> Result { + if completed_parts.is_empty() { + // GCS doesn't allow empty multipart uploads + let result = self.put_request(path, Default::default()).send().await?; + self.multipart_cleanup(path, multipart_id).await?; + return Ok(result); + } + let upload_id = multipart_id.clone(); let url = self.object_url(path); From a34e490cc5499078ecb6f5d87bc80816e916df5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Guedes?= Date: Tue, 9 Apr 2024 07:43:18 -0300 Subject: [PATCH 290/397] Adds send_retry_with_idempotency and retry more kinds of transport errors (#5609) --- src/aws/client.rs | 17 ++- src/aws/credential.rs | 4 +- src/aws/dynamo.rs | 6 +- src/aws/mod.rs | 8 +- src/azure/client.rs | 17 ++- src/azure/credential.rs | 4 +- src/client/retry.rs | 269 +++++++++++++++++++++++----------------- src/gcp/client.rs | 32 +++-- src/gcp/credential.rs | 18 +-- src/http/client.rs | 2 +- 10 files changed, 233 insertions(+), 144 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 4d10145..838bef8 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -268,6 +268,7 @@ pub(crate) struct Request<'a> { builder: RequestBuilder, payload_sha256: Option>, use_session_creds: bool, + idempotent: bool, } impl<'a> Request<'a> { @@ -285,6 +286,11 @@ impl<'a> Request<'a> { Self { builder, ..self } } + pub fn set_idempotent(mut self, idempotent: bool) -> Self { + self.idempotent = idempotent; + self + } + pub async fn send(self) -> Result { let credential = match self.use_session_creds { true => self.config.get_session_credential().await?, @@ -298,7 +304,7 @@ impl<'a> Request<'a> { let path = self.path.as_ref(); self.builder 
.with_aws_sigv4(credential.authorizer(), self.payload_sha256.as_deref()) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, self.idempotent) .await .context(RetrySnafu { path }) } @@ -360,6 +366,7 @@ impl S3Client { payload_sha256, config: &self.config, use_session_creds: true, + idempotent: false, } } @@ -462,7 +469,7 @@ impl S3Client { .header(CONTENT_TYPE, "application/xml") .body(body) .with_aws_sigv4(credential.authorizer(), payload_sha256.as_deref()) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, false) .await .context(DeleteObjectsRequestSnafu {})? .bytes() @@ -510,6 +517,7 @@ impl S3Client { config: &self.config, payload_sha256: None, use_session_creds: false, + idempotent: false, } } @@ -522,7 +530,7 @@ impl S3Client { .request(Method::POST, url) .headers(self.config.encryption_headers.clone().into()) .with_aws_sigv4(credential.authorizer(), None) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, true) .await .context(CreateMultipartRequestSnafu)? .bytes() @@ -547,6 +555,7 @@ impl S3Client { let response = self .put_request(path, data, false) .query(&[("partNumber", &part), ("uploadId", upload_id)]) + .set_idempotent(true) .send() .await?; @@ -582,7 +591,7 @@ impl S3Client { .query(&[("uploadId", upload_id)]) .body(body) .with_aws_sigv4(credential.authorizer(), None) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, true) .await .context(CompleteMultipartRequestSnafu)?; diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 478e56d..a7d1a97 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -517,7 +517,7 @@ async fn instance_creds( let token_result = client .request(Method::PUT, token_url) .header("X-aws-ec2-metadata-token-ttl-seconds", "600") // 10 minute TTL - .send_retry(retry_config) + .send_retry_with_idempotency(retry_config, true) .await; let token = match token_result { @@ -607,7 +607,7 @@ async fn web_identity( ("Version", "2011-06-15"), ("WebIdentityToken", &token), ]) - .send_retry(retry_config) + .send_retry_with_idempotency(retry_config, true) .await? 
.bytes() .await?; diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index 2390187..2e60bba 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -186,7 +186,11 @@ impl DynamoCommit { to: &Path, ) -> Result<()> { self.conditional_op(client, to, None, || async { - client.copy_request(from, to).send().await?; + client + .copy_request(from, to) + .set_idempotent(false) + .send() + .await?; Ok(()) }) .await diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 76d01d5..16af4d3 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -159,7 +159,7 @@ impl ObjectStore for AmazonS3 { } match (opts.mode, &self.client.config.conditional_put) { - (PutMode::Overwrite, _) => request.do_put().await, + (PutMode::Overwrite, _) => request.set_idempotent(true).do_put().await, (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { match request.header(&IF_NONE_MATCH, "*").do_put().await { @@ -268,7 +268,11 @@ impl ObjectStore for AmazonS3 { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - self.client.copy_request(from, to).send().await?; + self.client + .copy_request(from, to) + .set_idempotent(true) + .send() + .await?; Ok(()) } diff --git a/src/azure/client.rs b/src/azure/client.rs index 5be6658..0e6af50 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -172,6 +172,7 @@ struct PutRequest<'a> { path: &'a Path, config: &'a AzureConfig, builder: RequestBuilder, + idempotent: bool, } impl<'a> PutRequest<'a> { @@ -185,12 +186,17 @@ impl<'a> PutRequest<'a> { Self { builder, ..self } } + fn set_idempotent(mut self, idempotent: bool) -> Self { + self.idempotent = idempotent; + self + } + async fn send(self) -> Result { let credential = self.config.get_credential().await?; let response = self .builder .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, self.idempotent) .await .context(PutRequestSnafu { path: self.path.as_ref(), @@ -239,6 +245,7 @@ impl AzureClient { path, builder, config: &self.config, + idempotent: false, } } @@ -247,7 +254,7 @@ impl AzureClient { let builder = self.put_request(path, bytes); let builder = match &opts.mode { - PutMode::Overwrite => builder, + PutMode::Overwrite => builder.set_idempotent(true), PutMode::Create => builder.header(&IF_NONE_MATCH, "*"), PutMode::Update(v) => { let etag = v.e_tag.as_ref().context(MissingETagSnafu)?; @@ -271,6 +278,7 @@ impl AzureClient { self.put_request(path, data) .query(&[("comp", "block"), ("blockid", &block_id)]) + .set_idempotent(true) .send() .await?; @@ -287,6 +295,7 @@ impl AzureClient { let response = self .put_request(path, BlockList { blocks }.to_xml().into()) .query(&[("comp", "blocklist")]) + .set_idempotent(true) .send() .await?; @@ -340,7 +349,7 @@ impl AzureClient { builder .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, true) .await .map_err(|err| err.error(STORE, from.to_string()))?; @@ -373,7 +382,7 @@ impl AzureClient { .body(body) .query(&[("restype", "service"), ("comp", "userdelegationkey")]) .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, true) .await .context(DelegationKeyRequestSnafu)? 
.bytes() diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 6dc3141..36845bd 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -615,7 +615,7 @@ impl TokenProvider for ClientSecretOAuthProvider { ("scope", AZURE_STORAGE_SCOPE), ("grant_type", "client_credentials"), ]) - .send_retry(retry) + .send_retry_with_idempotency(retry, true) .await .context(TokenRequestSnafu)? .json() @@ -797,7 +797,7 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { ("scope", AZURE_STORAGE_SCOPE), ("grant_type", "client_credentials"), ]) - .send_retry(retry) + .send_retry_with_idempotency(retry, true) .await .context(TokenRequestSnafu)? .json() diff --git a/src/client/retry.rs b/src/client/retry.rs index e4bb5c9..f3fa715 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -166,128 +166,83 @@ impl Default for RetryConfig { } } -pub trait RetryExt { - /// Dispatch a request with the given retry configuration - /// - /// # Panic - /// - /// This will panic if the request body is a stream - fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result>; -} - -impl RetryExt for reqwest::RequestBuilder { - fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result> { - let mut backoff = Backoff::new(&config.backoff); - let max_retries = config.max_retries; - let retry_timeout = config.retry_timeout; - - let (client, req) = self.build_split(); - let req = req.expect("request must be valid"); - - async move { - let mut retries = 0; - let now = Instant::now(); - - loop { - let s = req.try_clone().expect("request body must be cloneable"); - match client.execute(s).await { - Ok(r) => match r.error_for_status_ref() { - Ok(_) if r.status().is_success() => return Ok(r), - Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { - return Err(Error::Client { +fn send_retry_impl( + builder: reqwest::RequestBuilder, + config: &RetryConfig, + is_idempotent: Option, +) -> BoxFuture<'static, Result> { + let mut backoff = Backoff::new(&config.backoff); + let max_retries = config.max_retries; + let retry_timeout = config.retry_timeout; + + let (client, req) = builder.build_split(); + let req = req.expect("request must be valid"); + let is_idempotent = is_idempotent.unwrap_or(req.method().is_safe()); + + async move { + let mut retries = 0; + let now = Instant::now(); + + loop { + let s = req.try_clone().expect("request body must be cloneable"); + match client.execute(s).await { + Ok(r) => match r.error_for_status_ref() { + Ok(_) if r.status().is_success() => return Ok(r), + Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { + return Err(Error::Client { + body: None, + status: StatusCode::NOT_MODIFIED, + }) + } + Ok(r) => { + let is_bare_redirect = r.status().is_redirection() && !r.headers().contains_key(LOCATION); + return match is_bare_redirect { + true => Err(Error::BareRedirect), + // Not actually sure if this is reachable, but here for completeness + false => Err(Error::Client { body: None, - status: StatusCode::NOT_MODIFIED, + status: r.status(), }) } - Ok(r) => { - let is_bare_redirect = r.status().is_redirection() && !r.headers().contains_key(LOCATION); - return match is_bare_redirect { - true => Err(Error::BareRedirect), - // Not actually sure if this is reachable, but here for completeness - false => Err(Error::Client { - body: None, - status: r.status(), - }) - } - } - Err(e) => { - let status = r.status(); - if retries == max_retries - || now.elapsed() > retry_timeout - || !status.is_server_error() { - - return Err(match status.is_client_error() { - true 
=> match r.text().await { - Ok(body) => { - Error::Client { - body: Some(body).filter(|b| !b.is_empty()), - status, - } - } - Err(e) => { - Error::Reqwest { - retries, - max_retries, - elapsed: now.elapsed(), - retry_timeout, - source: e, - } + } + Err(e) => { + let status = r.status(); + if retries == max_retries + || now.elapsed() > retry_timeout + || !status.is_server_error() { + + return Err(match status.is_client_error() { + true => match r.text().await { + Ok(body) => { + Error::Client { + body: Some(body).filter(|b| !b.is_empty()), + status, } } - false => Error::Reqwest { - retries, - max_retries, - elapsed: now.elapsed(), - retry_timeout, - source: e, + Err(e) => { + Error::Reqwest { + retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, + source: e, + } } - }); - } - - let sleep = backoff.next(); - retries += 1; - info!( - "Encountered server error, backing off for {} seconds, retry {} of {}: {}", - sleep.as_secs_f32(), - retries, - max_retries, - e, - ); - tokio::time::sleep(sleep).await; - } - }, - Err(e) => - { - let mut do_retry = false; - if e.is_connect() || ( req.method().is_safe() && e.is_timeout()) { - do_retry = true - } else { - let mut source = e.source(); - while let Some(e) = source { - if let Some(e) = e.downcast_ref::() { - do_retry = e.is_closed() || e.is_incomplete_message(); - break } - source = e.source(); - } + false => Error::Reqwest { + retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, + source: e, + } + }); } - if retries == max_retries - || now.elapsed() > retry_timeout - || !do_retry { - - return Err(Error::Reqwest { - retries, - max_retries, - elapsed: now.elapsed(), - retry_timeout, - source: e, - }) - } let sleep = backoff.next(); retries += 1; info!( - "Encountered transport error backing off for {} seconds, retry {} of {}: {}", + "Encountered server error, backing off for {} seconds, retry {} of {}: {}", sleep.as_secs_f32(), retries, max_retries, @@ -295,10 +250,102 @@ impl RetryExt for reqwest::RequestBuilder { ); tokio::time::sleep(sleep).await; } + }, + Err(e) => + { + let mut do_retry = false; + if e.is_connect() + || e.is_body() + || (e.is_request() && !e.is_timeout()) + || (is_idempotent && e.is_timeout()) { + do_retry = true + } else { + let mut source = e.source(); + while let Some(e) = source { + if let Some(e) = e.downcast_ref::() { + do_retry = e.is_closed() + || e.is_incomplete_message() + || e.is_body_write_aborted() + || (is_idempotent && e.is_timeout()); + break + } + if let Some(e) = e.downcast_ref::() { + if e.kind() == std::io::ErrorKind::TimedOut { + do_retry = is_idempotent; + } else { + do_retry = matches!( + e.kind(), + std::io::ErrorKind::ConnectionReset + | std::io::ErrorKind::ConnectionAborted + | std::io::ErrorKind::BrokenPipe + | std::io::ErrorKind::UnexpectedEof + ); + } + break; + } + source = e.source(); + } + } + + if retries == max_retries + || now.elapsed() > retry_timeout + || !do_retry { + + return Err(Error::Reqwest { + retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, + source: e, + }) + } + let sleep = backoff.next(); + retries += 1; + info!( + "Encountered transport error backing off for {} seconds, retry {} of {}: {}", + sleep.as_secs_f32(), + retries, + max_retries, + e, + ); + tokio::time::sleep(sleep).await; } } } - .boxed() + } + .boxed() +} + +pub trait RetryExt { + /// Dispatch a request with the given retry configuration + /// + /// # Panic + /// + /// This will panic if the request body is a stream + fn send_retry(self, config: &RetryConfig) -> 
BoxFuture<'static, Result>; + + /// Dispatch a request with the given retry configuration and idempotency + /// + /// # Panic + /// + /// This will panic if the request body is a stream + fn send_retry_with_idempotency( + self, + config: &RetryConfig, + is_idempotent: bool, + ) -> BoxFuture<'static, Result>; +} + +impl RetryExt for reqwest::RequestBuilder { + fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result> { + send_retry_impl(self, config, None) + } + fn send_retry_with_idempotency( + self, + config: &RetryConfig, + is_idempotent: bool, + ) -> BoxFuture<'static, Result> { + send_retry_impl(self, config, Some(is_idempotent)) } } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 3762915..17404f9 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -173,6 +173,7 @@ pub struct PutRequest<'a> { path: &'a Path, config: &'a GoogleCloudStorageConfig, builder: RequestBuilder, + idempotent: bool, } impl<'a> PutRequest<'a> { @@ -186,12 +187,17 @@ impl<'a> PutRequest<'a> { Self { builder, ..self } } + fn set_idempotent(mut self, idempotent: bool) -> Self { + self.idempotent = idempotent; + self + } + async fn send(self) -> Result { let credential = self.config.credentials.get_credential().await?; let response = self .builder .bearer_auth(&credential.bearer) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, self.idempotent) .await .context(PutRequestSnafu { path: self.path.as_ref(), @@ -281,7 +287,7 @@ impl GoogleCloudStorageClient { .post(&url) .bearer_auth(&credential.bearer) .json(&body) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, true) .await .context(SignBlobRequestSnafu)?; @@ -329,6 +335,7 @@ impl GoogleCloudStorageClient { path, builder, config: &self.config, + idempotent: false, } } @@ -336,7 +343,7 @@ impl GoogleCloudStorageClient { let builder = self.put_request(path, data); let builder = match &opts.mode { - PutMode::Overwrite => builder, + PutMode::Overwrite => builder.set_idempotent(true), PutMode::Create => builder.header(&VERSION_MATCH, "0"), PutMode::Update(v) => { let etag = v.version.as_ref().context(MissingVersionSnafu)?; @@ -366,7 +373,12 @@ impl GoogleCloudStorageClient { ("partNumber", &format!("{}", part_idx + 1)), ("uploadId", upload_id), ]; - let result = self.put_request(path, data).query(query).send().await?; + let result = self + .put_request(path, data) + .query(query) + .set_idempotent(true) + .send() + .await?; Ok(PartId { content_id: result.e_tag.unwrap(), @@ -391,7 +403,7 @@ impl GoogleCloudStorageClient { .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, "0") .query(&[("uploads", "")]) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, true) .await .context(PutRequestSnafu { path: path.as_ref(), @@ -432,7 +444,11 @@ impl GoogleCloudStorageClient { ) -> Result { if completed_parts.is_empty() { // GCS doesn't allow empty multipart uploads - let result = self.put_request(path, Default::default()).send().await?; + let result = self + .put_request(path, Default::default()) + .set_idempotent(true) + .send() + .await?; self.multipart_cleanup(path, multipart_id).await?; return Ok(result); } @@ -456,7 +472,7 @@ impl GoogleCloudStorageClient { .bearer_auth(&credential.bearer) .query(&[("uploadId", upload_id)]) .body(data) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, true) .await 
.context(CompleteMultipartRequestSnafu)?; @@ -515,7 +531,7 @@ impl GoogleCloudStorageClient { // Needed if reqwest is compiled with native-tls instead of rustls-tls // See https://github.com/apache/arrow-rs/pull/3921 .header(header::CONTENT_LENGTH, 0) - .send_retry(&self.config.retry_config) + .send_retry_with_idempotency(&self.config.retry_config, !if_not_exists) .await .map_err(|err| match err.status() { Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index fcd516a..158716c 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -623,7 +623,7 @@ impl TokenProvider for AuthorizedUserCredentials { ("client_secret", &self.client_secret), ("refresh_token", &self.refresh_token), ]) - .send_retry(retry) + .send_retry_with_idempotency(retry, true) .await .context(TokenRequestSnafu)? .json::() @@ -709,12 +709,12 @@ impl GCSAuthorizer { /// Canonicalizes query parameters into the GCP canonical form /// form like: ///```plaintext - ///HTTP_VERB - ///PATH_TO_RESOURCE - ///CANONICAL_QUERY_STRING - ///CANONICAL_HEADERS + ///HTTP_VERB + ///PATH_TO_RESOURCE + ///CANONICAL_QUERY_STRING + ///CANONICAL_HEADERS /// - ///SIGNED_HEADERS + ///SIGNED_HEADERS ///PAYLOAD ///``` /// @@ -780,9 +780,9 @@ impl GCSAuthorizer { ///construct the string to sign ///form like: ///```plaintext - ///SIGNING_ALGORITHM - ///ACTIVE_DATETIME - ///CREDENTIAL_SCOPE + ///SIGNING_ALGORITHM + ///ACTIVE_DATETIME + ///CREDENTIAL_SCOPE ///HASHED_CANONICAL_REQUEST ///``` ///`ACTIVE_DATETIME` format:`YYYYMMDD'T'HHMMSS'Z'` diff --git a/src/http/client.rs b/src/http/client.rs index 8700775..fdc8751 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -189,7 +189,7 @@ impl Client { .client .request(method, url) .header("Depth", depth) - .send_retry(&self.retry_config) + .send_retry_with_idempotency(&self.retry_config, true) .await; let response = match result { From a0905ed61c79114cfae31682e826ff52d86a8842 Mon Sep 17 00:00:00 2001 From: Giovanni Manfredi Date: Thu, 11 Apr 2024 18:45:39 +0200 Subject: [PATCH 291/397] Fixed typos in object store (#5629) Solved some local error in spelling of local variables. Note that a spell checker was used and all the references to the old (wrong spelling) have been corrected. 
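(Editor's aside, referring back to the `send_retry_with_idempotency` change from #5609 above rather than to the typo fix in this commit: a sketch of how call sites inside the crate choose between the two entry points. `RetryExt` and the paths below are crate-internal, so this only compiles from within object_store:)

```rust
use crate::client::retry::{Error as RetryError, RetryExt};
use crate::RetryConfig;
use reqwest::{Client, Response};

// Completing a multipart upload can safely be repeated, so transport-level
// timeouts are retried even though POST is not a "safe" HTTP method.
async fn complete_upload(
    client: &Client,
    url: &str,
    retry: &RetryConfig,
) -> Result<Response, RetryError> {
    client
        .post(url)
        .send_retry_with_idempotency(retry, true)
        .await
}

// A plain `send_retry` now infers idempotency from the HTTP method and only
// retries the kinds of transport errors that are safe to replay.
async fn conditional_put(
    client: &Client,
    url: &str,
    retry: &RetryConfig,
) -> Result<Response, RetryError> {
    client.put(url).send_retry(retry).await
}
```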
Co-authored-by: Giovanni Manfredi --- src/gcp/builder.rs | 6 +++--- src/gcp/client.rs | 2 +- src/gcp/credential.rs | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 4fa9167..e6da312 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -113,7 +113,7 @@ pub struct GoogleCloudStorageBuilder { /// Credentials credentials: Option, /// Credentials for sign url - signing_cedentials: Option, + signing_credentials: Option, } /// Configuration keys for [`GoogleCloudStorageBuilder`] @@ -209,7 +209,7 @@ impl Default for GoogleCloudStorageBuilder { client_options: ClientOptions::new().with_allow_http(true), url: None, credentials: None, - signing_cedentials: None, + signing_credentials: None, } } } @@ -491,7 +491,7 @@ impl GoogleCloudStorageBuilder { )) as _ }; - let signing_credentials = if let Some(signing_credentials) = self.signing_cedentials { + let signing_credentials = if let Some(signing_credentials) = self.signing_credentials { signing_credentials } else if disable_oauth { Arc::new(StaticCredentialProvider::new(GcpSigningCredential { diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 17404f9..4aed81a 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -108,7 +108,7 @@ enum Error { #[snafu(display("Error signing blob: {}", source))] SignBlobRequest { source: crate::client::retry::Error }, - #[snafu(display("Got invalid signing blob repsonse: {}", source))] + #[snafu(display("Got invalid signing blob response: {}", source))] InvalidSignBlobResponse { source: reqwest::Error }, #[snafu(display("Got invalid signing blob signature: {}", source))] diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 158716c..abb0417 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -719,15 +719,15 @@ impl GCSAuthorizer { ///``` /// /// - fn canonicalize_request(url: &Url, methond: &Method, headers: &HeaderMap) -> String { - let verb = methond.as_str(); + fn canonicalize_request(url: &Url, method: &Method, headers: &HeaderMap) -> String { + let verb = method.as_str(); let path = url.path(); let query = Self::canonicalize_query(url); - let (canaonical_headers, signed_headers) = Self::canonicalize_headers(headers); + let (canonical_headers, signed_headers) = Self::canonicalize_headers(headers); format!( "{}\n{}\n{}\n{}\n\n{}\n{}", - verb, path, query, canaonical_headers, signed_headers, DEFAULT_GCS_PLAYLOAD_STRING + verb, path, query, canonical_headers, signed_headers, DEFAULT_GCS_PLAYLOAD_STRING ) } @@ -794,8 +794,8 @@ impl GCSAuthorizer { url: &Url, headers: &HeaderMap, ) -> String { - let caninical_request = Self::canonicalize_request(url, request_method, headers); - let hashed_canonical_req = hex_digest(caninical_request.as_bytes()); + let canonical_request = Self::canonicalize_request(url, request_method, headers); + let hashed_canonical_req = hex_digest(canonical_request.as_bytes()); let scope = self.scope(date); format!( From e468d6cad740b6e4524f5ececfb975c79555445e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 15 Apr 2024 14:50:19 +0100 Subject: [PATCH 292/397] Support non-contiguous put payloads / vectored writes (#5514) (#5538) * Support non-contiguous put payloads (#5514) * Docs * Add more docs * Review feedback --- src/aws/checksum.rs | 15 -- src/aws/client.rs | 79 +++++----- src/aws/credential.rs | 8 +- src/aws/dynamo.rs | 6 +- src/aws/mod.rs | 25 ++-- src/azure/client.rs | 44 ++++-- src/azure/credential.rs | 8 +- 
src/azure/mod.rs | 19 ++- src/buffered.rs | 16 +- src/chunked.rs | 16 +- src/client/retry.rs | 165 ++++++++++++--------- src/gcp/client.rs | 50 +++++-- src/gcp/credential.rs | 4 +- src/gcp/mod.rs | 23 +-- src/http/client.rs | 23 ++- src/http/mod.rs | 12 +- src/lib.rs | 142 +++++++++++------- src/limit.rs | 17 ++- src/local.rs | 53 ++++--- src/memory.rs | 34 +++-- src/multipart.rs | 5 +- src/payload.rs | 314 ++++++++++++++++++++++++++++++++++++++++ src/prefix.rs | 24 +-- src/throttle.rs | 30 ++-- src/upload.rs | 44 ++++-- tests/get_range_file.rs | 13 +- 26 files changed, 843 insertions(+), 346 deletions(-) create mode 100644 src/payload.rs diff --git a/src/aws/checksum.rs b/src/aws/checksum.rs index a50bd2d..d15bbf0 100644 --- a/src/aws/checksum.rs +++ b/src/aws/checksum.rs @@ -16,7 +16,6 @@ // under the License. use crate::config::Parse; -use ring::digest::{self, digest as ring_digest}; use std::str::FromStr; #[allow(non_camel_case_types)] @@ -27,20 +26,6 @@ pub enum Checksum { SHA256, } -impl Checksum { - pub(super) fn digest(&self, bytes: &[u8]) -> Vec { - match self { - Self::SHA256 => ring_digest(&digest::SHA256, bytes).as_ref().to_owned(), - } - } - - pub(super) fn header_name(&self) -> &'static str { - match self { - Self::SHA256 => "x-amz-checksum-sha256", - } - } -} - impl std::fmt::Display for Checksum { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match &self { diff --git a/src/aws/client.rs b/src/aws/client.rs index 838bef8..c1789ed 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -35,7 +35,8 @@ use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; use crate::{ - ClientOptions, GetOptions, ListResult, MultipartId, Path, PutResult, Result, RetryConfig, + ClientOptions, GetOptions, ListResult, MultipartId, Path, PutPayload, PutResult, Result, + RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -51,11 +52,14 @@ use reqwest::{ header::{CONTENT_LENGTH, CONTENT_TYPE}, Client as ReqwestClient, Method, RequestBuilder, Response, }; +use ring::digest; +use ring::digest::Context; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::sync::Arc; const VERSION_HEADER: &str = "x-amz-version-id"; +const SHA256_CHECKSUM: &str = "x-amz-checksum-sha256"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] @@ -266,7 +270,8 @@ pub(crate) struct Request<'a> { path: &'a Path, config: &'a S3Config, builder: RequestBuilder, - payload_sha256: Option>, + payload_sha256: Option, + payload: Option, use_session_creds: bool, idempotent: bool, } @@ -286,7 +291,7 @@ impl<'a> Request<'a> { Self { builder, ..self } } - pub fn set_idempotent(mut self, idempotent: bool) -> Self { + pub fn idempotent(mut self, idempotent: bool) -> Self { self.idempotent = idempotent; self } @@ -301,10 +306,15 @@ impl<'a> Request<'a> { }, }; + let sha = self.payload_sha256.as_ref().map(|x| x.as_ref()); + let path = self.path.as_ref(); self.builder - .with_aws_sigv4(credential.authorizer(), self.payload_sha256.as_deref()) - .send_retry_with_idempotency(&self.config.retry_config, self.idempotent) + .with_aws_sigv4(credential.authorizer(), sha) + .retryable(&self.config.retry_config) + .idempotent(self.idempotent) + .payload(self.payload) + .send() .await .context(RetrySnafu { path }) } @@ -333,7 +343,7 @@ impl S3Client { pub fn put_request<'a>( &'a self, path: &'a Path, - bytes: Bytes, + payload: PutPayload, with_encryption_headers: bool, ) -> Request<'a> { let url = 
self.config.path_url(path); @@ -341,20 +351,17 @@ impl S3Client { if with_encryption_headers { builder = builder.headers(self.config.encryption_headers.clone().into()); } - let mut payload_sha256 = None; - if let Some(checksum) = self.config.checksum { - let digest = checksum.digest(&bytes); - builder = builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); - if checksum == Checksum::SHA256 { - payload_sha256 = Some(digest); - } - } + let mut sha256 = Context::new(&digest::SHA256); + payload.iter().for_each(|x| sha256.update(x)); + let payload_sha256 = sha256.finish(); - builder = match bytes.is_empty() { - true => builder.header(CONTENT_LENGTH, 0), // Handle empty uploads (#4514) - false => builder.body(bytes), - }; + if let Some(Checksum::SHA256) = self.config.checksum { + builder = builder.header( + "x-amz-checksum-sha256", + BASE64_STANDARD.encode(payload_sha256), + ) + } if let Some(value) = self.config.client_options.get_content_type(path) { builder = builder.header(CONTENT_TYPE, value); @@ -362,8 +369,9 @@ impl S3Client { Request { path, - builder, - payload_sha256, + builder: builder.header(CONTENT_LENGTH, payload.content_length()), + payload: Some(payload), + payload_sha256: Some(payload_sha256), config: &self.config, use_session_creds: true, idempotent: false, @@ -446,16 +454,8 @@ impl S3Client { let mut builder = self.client.request(Method::POST, url); - // Compute checksum - S3 *requires* this for DeleteObjects requests, so we default to - // their algorithm if the user hasn't specified one. - let checksum = self.config.checksum.unwrap_or(Checksum::SHA256); - let digest = checksum.digest(&body); - builder = builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); - let payload_sha256 = if checksum == Checksum::SHA256 { - Some(digest) - } else { - None - }; + let digest = digest::digest(&digest::SHA256, &body); + builder = builder.header(SHA256_CHECKSUM, BASE64_STANDARD.encode(digest)); // S3 *requires* DeleteObjects to include a Content-MD5 header: // https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html @@ -468,8 +468,8 @@ impl S3Client { let response = builder .header(CONTENT_TYPE, "application/xml") .body(body) - .with_aws_sigv4(credential.authorizer(), payload_sha256.as_deref()) - .send_retry_with_idempotency(&self.config.retry_config, false) + .with_aws_sigv4(credential.authorizer(), Some(digest.as_ref())) + .send_retry(&self.config.retry_config) .await .context(DeleteObjectsRequestSnafu {})? .bytes() @@ -515,6 +515,7 @@ impl S3Client { builder, path: from, config: &self.config, + payload: None, payload_sha256: None, use_session_creds: false, idempotent: false, @@ -530,7 +531,9 @@ impl S3Client { .request(Method::POST, url) .headers(self.config.encryption_headers.clone().into()) .with_aws_sigv4(credential.authorizer(), None) - .send_retry_with_idempotency(&self.config.retry_config, true) + .retryable(&self.config.retry_config) + .idempotent(true) + .send() .await .context(CreateMultipartRequestSnafu)? 
.bytes() @@ -548,14 +551,14 @@ impl S3Client { path: &Path, upload_id: &MultipartId, part_idx: usize, - data: Bytes, + data: PutPayload, ) -> Result { let part = (part_idx + 1).to_string(); let response = self .put_request(path, data, false) .query(&[("partNumber", &part), ("uploadId", upload_id)]) - .set_idempotent(true) + .idempotent(true) .send() .await?; @@ -573,7 +576,7 @@ impl S3Client { // If no parts were uploaded, upload an empty part // otherwise the completion request will fail let part = self - .put_part(location, &upload_id.to_string(), 0, Bytes::new()) + .put_part(location, &upload_id.to_string(), 0, PutPayload::default()) .await?; vec![part] } else { @@ -591,7 +594,9 @@ impl S3Client { .query(&[("uploadId", upload_id)]) .body(body) .with_aws_sigv4(credential.authorizer(), None) - .send_retry_with_idempotency(&self.config.retry_config, true) + .retryable(&self.config.retry_config) + .idempotent(true) + .send() .await .context(CompleteMultipartRequestSnafu)?; diff --git a/src/aws/credential.rs b/src/aws/credential.rs index a7d1a97..08831fd 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -517,7 +517,9 @@ async fn instance_creds( let token_result = client .request(Method::PUT, token_url) .header("X-aws-ec2-metadata-token-ttl-seconds", "600") // 10 minute TTL - .send_retry_with_idempotency(retry_config, true) + .retryable(retry_config) + .idempotent(true) + .send() .await; let token = match token_result { @@ -607,7 +609,9 @@ async fn web_identity( ("Version", "2011-06-15"), ("WebIdentityToken", &token), ]) - .send_retry_with_idempotency(retry_config, true) + .retryable(retry_config) + .idempotent(true) + .send() .await? .bytes() .await?; diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index 2e60bba..2390187 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -186,11 +186,7 @@ impl DynamoCommit { to: &Path, ) -> Result<()> { self.conditional_op(client, to, None, || async { - client - .copy_request(from, to) - .set_idempotent(false) - .send() - .await?; + client.copy_request(from, to).send().await?; Ok(()) }) .await diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 16af4d3..9e741c9 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -29,7 +29,6 @@ //! 
[automatic cleanup]: https://aws.amazon.com/blogs/aws/s3-lifecycle-management-update-support-for-multipart-uploads-and-delete-markers/ use async_trait::async_trait; -use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use reqwest::header::{HeaderName, IF_MATCH, IF_NONE_MATCH}; @@ -46,7 +45,7 @@ use crate::signer::Signer; use crate::util::STRICT_ENCODE_SET; use crate::{ Error, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, ObjectMeta, - ObjectStore, Path, PutMode, PutOptions, PutResult, Result, UploadPart, + ObjectStore, Path, PutMode, PutOptions, PutPayload, PutResult, Result, UploadPart, }; static TAGS_HEADER: HeaderName = HeaderName::from_static("x-amz-tagging"); @@ -151,15 +150,20 @@ impl Signer for AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { - let mut request = self.client.put_request(location, bytes, true); + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + let mut request = self.client.put_request(location, payload, true); let tags = opts.tags.encoded(); if !tags.is_empty() && !self.client.config.disable_tagging { request = request.header(&TAGS_HEADER, tags); } match (opts.mode, &self.client.config.conditional_put) { - (PutMode::Overwrite, _) => request.set_idempotent(true).do_put().await, + (PutMode::Overwrite, _) => request.idempotent(true).do_put().await, (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { match request.header(&IF_NONE_MATCH, "*").do_put().await { @@ -270,7 +274,7 @@ impl ObjectStore for AmazonS3 { async fn copy(&self, from: &Path, to: &Path) -> Result<()> { self.client .copy_request(from, to) - .set_idempotent(true) + .idempotent(true) .send() .await?; Ok(()) @@ -320,7 +324,7 @@ struct UploadState { #[async_trait] impl MultipartUpload for S3MultiPartUpload { - fn put_part(&mut self, data: Bytes) -> UploadPart { + fn put_part(&mut self, data: PutPayload) -> UploadPart { let idx = self.part_idx; self.part_idx += 1; let state = Arc::clone(&self.state); @@ -362,7 +366,7 @@ impl MultipartStore for AmazonS3 { path: &Path, id: &MultipartId, part_idx: usize, - data: Bytes, + data: PutPayload, ) -> Result { self.client.put_part(path, id, part_idx, data).await } @@ -385,7 +389,6 @@ impl MultipartStore for AmazonS3 { mod tests { use super::*; use crate::{client::get::GetClient, tests::*}; - use bytes::Bytes; use hyper::HeaderMap; const NON_EXISTENT_NAME: &str = "nonexistentname"; @@ -474,7 +477,7 @@ mod tests { let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); - let data = Bytes::from("arbitrary data"); + let data = PutPayload::from("arbitrary data"); let err = integration.put(&location, data).await.unwrap_err(); assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); @@ -531,7 +534,7 @@ mod tests { async fn s3_encryption(store: &AmazonS3) { crate::test_util::maybe_skip_integration!(); - let data = Bytes::from(vec![3u8; 1024]); + let data = PutPayload::from(vec![3u8; 1024]); let encryption_headers: HeaderMap = store.client.config.encryption_headers.clone().into(); let expected_encryption = diff --git a/src/azure/client.rs b/src/azure/client.rs index 0e6af50..d5972d0 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -27,8 +27,8 @@ use crate::multipart::PartId; use crate::path::DELIMITER; use crate::util::{deserialize_rfc1123, GetRange}; use crate::{ - ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutMode, PutOptions, PutResult, - Result, RetryConfig, + ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutMode, PutOptions, PutPayload, + PutResult, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -171,6 +171,7 @@ impl AzureConfig { struct PutRequest<'a> { path: &'a Path, config: &'a AzureConfig, + payload: PutPayload, builder: RequestBuilder, idempotent: bool, } @@ -195,8 +196,12 @@ impl<'a> PutRequest<'a> { let credential = self.config.get_credential().await?; let response = self .builder + .header(CONTENT_LENGTH, self.payload.content_length()) .with_azure_authorization(&credential, &self.config.account) - .send_retry_with_idempotency(&self.config.retry_config, self.idempotent) + .retryable(&self.config.retry_config) + .idempotent(true) + .payload(Some(self.payload)) + .send() .await .context(PutRequestSnafu { path: self.path.as_ref(), @@ -228,7 +233,7 @@ impl AzureClient { self.config.get_credential().await } - fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> PutRequest<'a> { + fn put_request<'a>(&'a self, path: &'a Path, payload: PutPayload) -> PutRequest<'a> { let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); @@ -237,21 +242,23 @@ impl AzureClient { builder = builder.header(CONTENT_TYPE, value); } - builder = builder - .header(CONTENT_LENGTH, HeaderValue::from(bytes.len())) - .body(bytes); - PutRequest { path, builder, + payload, config: &self.config, idempotent: false, } } /// Make an Azure PUT request - pub async fn put_blob(&self, path: &Path, bytes: Bytes, opts: PutOptions) -> Result { - let builder = self.put_request(path, bytes); + pub async fn put_blob( + &self, + path: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + let builder = self.put_request(path, payload); let builder = match &opts.mode { PutMode::Overwrite => builder.set_idempotent(true), @@ -272,11 +279,16 @@ impl AzureClient { } /// PUT a block - pub async fn put_block(&self, path: &Path, part_idx: usize, data: Bytes) -> Result { + pub async fn put_block( + &self, + path: &Path, + part_idx: usize, + payload: PutPayload, + ) -> Result { let content_id = format!("{part_idx:20}"); let block_id = BASE64_STANDARD.encode(&content_id); - self.put_request(path, data) + self.put_request(path, payload) .query(&[("comp", "block"), ("blockid", &block_id)]) .set_idempotent(true) .send() @@ -349,7 +361,9 @@ impl AzureClient { builder .with_azure_authorization(&credential, &self.config.account) - .send_retry_with_idempotency(&self.config.retry_config, true) + .retryable(&self.config.retry_config) + .idempotent(overwrite) + .send() .await .map_err(|err| err.error(STORE, from.to_string()))?; @@ -382,7 +396,9 @@ impl AzureClient { .body(body) .query(&[("restype", "service"), ("comp", "userdelegationkey")]) 
.with_azure_authorization(&credential, &self.config.account) - .send_retry_with_idempotency(&self.config.retry_config, true) + .retryable(&self.config.retry_config) + .idempotent(true) + .send() .await .context(DelegationKeyRequestSnafu)? .bytes() diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 36845bd..c8212a9 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -615,7 +615,9 @@ impl TokenProvider for ClientSecretOAuthProvider { ("scope", AZURE_STORAGE_SCOPE), ("grant_type", "client_credentials"), ]) - .send_retry_with_idempotency(retry, true) + .retryable(retry) + .idempotent(true) + .send() .await .context(TokenRequestSnafu)? .json() @@ -797,7 +799,9 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { ("scope", AZURE_STORAGE_SCOPE), ("grant_type", "client_credentials"), ]) - .send_retry_with_idempotency(retry, true) + .retryable(retry) + .idempotent(true) + .send() .await .context(TokenRequestSnafu)? .json() diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 5d3a405..8dc5242 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -27,10 +27,9 @@ use crate::{ path::Path, signer::Signer, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, ObjectMeta, ObjectStore, - PutOptions, PutResult, Result, UploadPart, + PutOptions, PutPayload, PutResult, Result, UploadPart, }; use async_trait::async_trait; -use bytes::Bytes; use futures::stream::BoxStream; use reqwest::Method; use std::fmt::Debug; @@ -87,8 +86,13 @@ impl std::fmt::Display for MicrosoftAzure { #[async_trait] impl ObjectStore for MicrosoftAzure { - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { - self.client.put_blob(location, bytes, opts).await + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + self.client.put_blob(location, payload, opts).await } async fn put_multipart(&self, location: &Path) -> Result> { @@ -203,7 +207,7 @@ struct UploadState { #[async_trait] impl MultipartUpload for AzureMultiPartUpload { - fn put_part(&mut self, data: Bytes) -> UploadPart { + fn put_part(&mut self, data: PutPayload) -> UploadPart { let idx = self.part_idx; self.part_idx += 1; let state = Arc::clone(&self.state); @@ -240,7 +244,7 @@ impl MultipartStore for MicrosoftAzure { path: &Path, _: &MultipartId, part_idx: usize, - data: Bytes, + data: PutPayload, ) -> Result { self.client.put_block(path, part_idx, data).await } @@ -265,6 +269,7 @@ impl MultipartStore for MicrosoftAzure { mod tests { use super::*; use crate::tests::*; + use bytes::Bytes; #[tokio::test] async fn azure_blob_test() { @@ -309,7 +314,7 @@ mod tests { let data = Bytes::from("hello world"); let path = Path::from("file.txt"); - integration.put(&path, data.clone()).await.unwrap(); + integration.put(&path, data.clone().into()).await.unwrap(); let signed = integration .signed_url(Method::GET, &path, Duration::from_secs(60)) diff --git a/src/buffered.rs b/src/buffered.rs index de6d4eb..d412241 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -18,7 +18,7 @@ //! 
Utilities for performing tokio-style buffered IO use crate::path::Path; -use crate::{ObjectMeta, ObjectStore, WriteMultipart}; +use crate::{ObjectMeta, ObjectStore, PutPayloadMut, WriteMultipart}; use bytes::Bytes; use futures::future::{BoxFuture, FutureExt}; use futures::ready; @@ -231,7 +231,7 @@ impl std::fmt::Debug for BufWriter { enum BufWriterState { /// Buffer up to capacity bytes - Buffer(Path, Vec), + Buffer(Path, PutPayloadMut), /// [`ObjectStore::put_multipart`] Prepare(BoxFuture<'static, std::io::Result>), /// Write to a multipart upload @@ -252,7 +252,7 @@ impl BufWriter { capacity, store, max_concurrency: 8, - state: BufWriterState::Buffer(path, Vec::new()), + state: BufWriterState::Buffer(path, PutPayloadMut::new()), } } @@ -303,14 +303,16 @@ impl AsyncWrite for BufWriter { continue; } BufWriterState::Buffer(path, b) => { - if b.len().saturating_add(buf.len()) >= cap { + if b.content_length().saturating_add(buf.len()) >= cap { let buffer = std::mem::take(b); let path = std::mem::take(path); let store = Arc::clone(&self.store); self.state = BufWriterState::Prepare(Box::pin(async move { let upload = store.put_multipart(&path).await?; - let mut chunked = WriteMultipart::new(upload); - chunked.write(&buffer); + let mut chunked = WriteMultipart::new_with_chunk_size(upload, cap); + for chunk in buffer.freeze() { + chunked.put(chunk); + } Ok(chunked) })); continue; @@ -391,7 +393,7 @@ mod tests { const BYTES: usize = 4096; let data: Bytes = b"12345678".iter().cycle().copied().take(BYTES).collect(); - store.put(&existent, data.clone()).await.unwrap(); + store.put(&existent, data.clone().into()).await.unwrap(); let meta = store.head(&existent).await.unwrap(); diff --git a/src/chunked.rs b/src/chunked.rs index 6db7f4b..9abe49d 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -27,11 +27,11 @@ use futures::stream::BoxStream; use futures::StreamExt; use crate::path::Path; -use crate::Result; use crate::{ GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutOptions, PutResult, }; +use crate::{PutPayload, Result}; /// Wraps a [`ObjectStore`] and makes its get response return chunks /// in a controllable manner. @@ -62,8 +62,13 @@ impl Display for ChunkedStore { #[async_trait] impl ObjectStore for ChunkedStore { - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { - self.inner.put_opts(location, bytes, opts).await + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + self.inner.put_opts(location, payload, opts).await } async fn put_multipart(&self, location: &Path) -> Result> { @@ -176,10 +181,7 @@ mod tests { async fn test_chunked_basic() { let location = Path::parse("test").unwrap(); let store: Arc = Arc::new(InMemory::new()); - store - .put(&location, Bytes::from(vec![0; 1001])) - .await - .unwrap(); + store.put(&location, vec![0; 1001].into()).await.unwrap(); for chunk_size in [10, 20, 31] { let store = ChunkedStore::new(Arc::clone(&store), chunk_size); diff --git a/src/client/retry.rs b/src/client/retry.rs index f3fa715..5dfdd55 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -18,10 +18,10 @@ //! 
A shared HTTP client implementation incorporating retries use crate::client::backoff::{Backoff, BackoffConfig}; +use crate::PutPayload; use futures::future::BoxFuture; -use futures::FutureExt; use reqwest::header::LOCATION; -use reqwest::{Response, StatusCode}; +use reqwest::{Client, Request, Response, StatusCode}; use snafu::Error as SnafuError; use snafu::Snafu; use std::time::{Duration, Instant}; @@ -166,26 +166,57 @@ impl Default for RetryConfig { } } -fn send_retry_impl( - builder: reqwest::RequestBuilder, - config: &RetryConfig, - is_idempotent: Option, -) -> BoxFuture<'static, Result> { - let mut backoff = Backoff::new(&config.backoff); - let max_retries = config.max_retries; - let retry_timeout = config.retry_timeout; +pub struct RetryableRequest { + client: Client, + request: Request, - let (client, req) = builder.build_split(); - let req = req.expect("request must be valid"); - let is_idempotent = is_idempotent.unwrap_or(req.method().is_safe()); + max_retries: usize, + retry_timeout: Duration, + backoff: Backoff, - async move { + idempotent: Option, + payload: Option, +} + +impl RetryableRequest { + /// Set whether this request is idempotent + /// + /// An idempotent request will be retried on timeout even if the request + /// method is not [safe](https://datatracker.ietf.org/doc/html/rfc7231#section-4.2.1) + pub fn idempotent(self, idempotent: bool) -> Self { + Self { + idempotent: Some(idempotent), + ..self + } + } + + /// Provide a [`PutPayload`] + pub fn payload(self, payload: Option) -> Self { + Self { payload, ..self } + } + + pub async fn send(self) -> Result { + let max_retries = self.max_retries; + let retry_timeout = self.retry_timeout; let mut retries = 0; let now = Instant::now(); + let mut backoff = self.backoff; + let is_idempotent = self + .idempotent + .unwrap_or_else(|| self.request.method().is_safe()); + loop { - let s = req.try_clone().expect("request body must be cloneable"); - match client.execute(s).await { + let mut request = self + .request + .try_clone() + .expect("request body must be cloneable"); + + if let Some(payload) = &self.payload { + *request.body_mut() = Some(payload.body()); + } + + match self.client.execute(request).await { Ok(r) => match r.error_for_status_ref() { Ok(_) if r.status().is_success() => return Ok(r), Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { @@ -195,47 +226,44 @@ fn send_retry_impl( }) } Ok(r) => { - let is_bare_redirect = r.status().is_redirection() && !r.headers().contains_key(LOCATION); + let is_bare_redirect = + r.status().is_redirection() && !r.headers().contains_key(LOCATION); return match is_bare_redirect { true => Err(Error::BareRedirect), // Not actually sure if this is reachable, but here for completeness false => Err(Error::Client { body: None, status: r.status(), - }) - } + }), + }; } Err(e) => { let status = r.status(); if retries == max_retries || now.elapsed() > retry_timeout - || !status.is_server_error() { - + || !status.is_server_error() + { return Err(match status.is_client_error() { true => match r.text().await { - Ok(body) => { - Error::Client { - body: Some(body).filter(|b| !b.is_empty()), - status, - } - } - Err(e) => { - Error::Reqwest { - retries, - max_retries, - elapsed: now.elapsed(), - retry_timeout, - source: e, - } - } - } + Ok(body) => Error::Client { + body: Some(body).filter(|b| !b.is_empty()), + status, + }, + Err(e) => Error::Reqwest { + retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, + source: e, + }, + }, false => Error::Reqwest { retries, max_retries, elapsed: 
now.elapsed(), retry_timeout, source: e, - } + }, }); } @@ -251,13 +279,13 @@ fn send_retry_impl( tokio::time::sleep(sleep).await; } }, - Err(e) => - { + Err(e) => { let mut do_retry = false; if e.is_connect() || e.is_body() || (e.is_request() && !e.is_timeout()) - || (is_idempotent && e.is_timeout()) { + || (is_idempotent && e.is_timeout()) + { do_retry = true } else { let mut source = e.source(); @@ -267,7 +295,7 @@ fn send_retry_impl( || e.is_incomplete_message() || e.is_body_write_aborted() || (is_idempotent && e.is_timeout()); - break + break; } if let Some(e) = e.downcast_ref::() { if e.kind() == std::io::ErrorKind::TimedOut { @@ -276,9 +304,9 @@ fn send_retry_impl( do_retry = matches!( e.kind(), std::io::ErrorKind::ConnectionReset - | std::io::ErrorKind::ConnectionAborted - | std::io::ErrorKind::BrokenPipe - | std::io::ErrorKind::UnexpectedEof + | std::io::ErrorKind::ConnectionAborted + | std::io::ErrorKind::BrokenPipe + | std::io::ErrorKind::UnexpectedEof ); } break; @@ -287,17 +315,14 @@ fn send_retry_impl( } } - if retries == max_retries - || now.elapsed() > retry_timeout - || !do_retry { - + if retries == max_retries || now.elapsed() > retry_timeout || !do_retry { return Err(Error::Reqwest { retries, max_retries, elapsed: now.elapsed(), retry_timeout, source: e, - }) + }); } let sleep = backoff.next(); retries += 1; @@ -313,39 +338,39 @@ fn send_retry_impl( } } } - .boxed() } pub trait RetryExt { + /// Return a [`RetryableRequest`] + fn retryable(self, config: &RetryConfig) -> RetryableRequest; + /// Dispatch a request with the given retry configuration /// /// # Panic /// /// This will panic if the request body is a stream fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result>; - - /// Dispatch a request with the given retry configuration and idempotency - /// - /// # Panic - /// - /// This will panic if the request body is a stream - fn send_retry_with_idempotency( - self, - config: &RetryConfig, - is_idempotent: bool, - ) -> BoxFuture<'static, Result>; } impl RetryExt for reqwest::RequestBuilder { - fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result> { - send_retry_impl(self, config, None) + fn retryable(self, config: &RetryConfig) -> RetryableRequest { + let (client, request) = self.build_split(); + let request = request.expect("request must be valid"); + + RetryableRequest { + client, + request, + max_retries: config.max_retries, + retry_timeout: config.retry_timeout, + backoff: Backoff::new(&config.backoff), + idempotent: None, + payload: None, + } } - fn send_retry_with_idempotency( - self, - config: &RetryConfig, - is_idempotent: bool, - ) -> BoxFuture<'static, Result> { - send_retry_impl(self, config, Some(is_idempotent)) + + fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result> { + let request = self.retryable(config); + Box::pin(async move { request.send().await }) } } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 4aed81a..f91217f 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -29,13 +29,14 @@ use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; use crate::util::hex_encode; use crate::{ - ClientOptions, GetOptions, ListResult, MultipartId, PutMode, PutOptions, PutResult, Result, - RetryConfig, + ClientOptions, GetOptions, ListResult, MultipartId, PutMode, PutOptions, PutPayload, PutResult, + Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; -use bytes::{Buf, Bytes}; +use bytes::Buf; +use hyper::header::CONTENT_LENGTH; 
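The retry.rs hunk above replaces `send_retry_with_idempotency` with a builder-style `RetryableRequest`, and the GCP, Azure and AWS client hunks in the remainder of this patch switch over to the same pattern. A minimal sketch of the resulting call shape, assuming the crate-internal `RetryExt`, `RetryableRequest` and `PutPayload` APIs introduced above (the function name and its locals are illustrative, not part of the patch):

```rust
// Illustrative sketch only: how client code inside the crate drives the new
// retry builder added in src/client/retry.rs above. `retryable`, `idempotent`,
// `payload` and `send` are the crate-internal APIs introduced by this patch.
use crate::client::retry::{self, RetryConfig, RetryExt};
use crate::PutPayload;
use reqwest::header::CONTENT_LENGTH;

async fn put_with_retry(
    client: &reqwest::Client,
    url: String,
    payload: PutPayload,
    retry: &RetryConfig,
) -> Result<reqwest::Response, retry::Error> {
    client
        .request(reqwest::Method::PUT, url)
        .header(CONTENT_LENGTH, payload.content_length())
        .retryable(retry)        // build a RetryableRequest from the RequestBuilder
        .idempotent(true)        // a full-object PUT is safe to retry on timeout
        .payload(Some(payload))  // the body is re-attached on every retry attempt
        .send()
        .await
}
```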
use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::header::HeaderName; use reqwest::{header, Client, Method, RequestBuilder, Response, StatusCode}; @@ -172,6 +173,7 @@ impl GoogleCloudStorageConfig { pub struct PutRequest<'a> { path: &'a Path, config: &'a GoogleCloudStorageConfig, + payload: PutPayload, builder: RequestBuilder, idempotent: bool, } @@ -197,7 +199,11 @@ impl<'a> PutRequest<'a> { let response = self .builder .bearer_auth(&credential.bearer) - .send_retry_with_idempotency(&self.config.retry_config, self.idempotent) + .header(CONTENT_LENGTH, self.payload.content_length()) + .retryable(&self.config.retry_config) + .idempotent(self.idempotent) + .payload(Some(self.payload)) + .send() .await .context(PutRequestSnafu { path: self.path.as_ref(), @@ -287,7 +293,9 @@ impl GoogleCloudStorageClient { .post(&url) .bearer_auth(&credential.bearer) .json(&body) - .send_retry_with_idempotency(&self.config.retry_config, true) + .retryable(&self.config.retry_config) + .idempotent(true) + .send() .await .context(SignBlobRequestSnafu)?; @@ -315,7 +323,7 @@ impl GoogleCloudStorageClient { /// Perform a put request /// /// Returns the new ETag - pub fn put_request<'a>(&'a self, path: &'a Path, payload: Bytes) -> PutRequest<'a> { + pub fn put_request<'a>(&'a self, path: &'a Path, payload: PutPayload) -> PutRequest<'a> { let url = self.object_url(path); let content_type = self @@ -327,20 +335,24 @@ impl GoogleCloudStorageClient { let builder = self .client .request(Method::PUT, url) - .header(header::CONTENT_TYPE, content_type) - .header(header::CONTENT_LENGTH, payload.len()) - .body(payload); + .header(header::CONTENT_TYPE, content_type); PutRequest { path, builder, + payload, config: &self.config, idempotent: false, } } - pub async fn put(&self, path: &Path, data: Bytes, opts: PutOptions) -> Result { - let builder = self.put_request(path, data); + pub async fn put( + &self, + path: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + let builder = self.put_request(path, payload); let builder = match &opts.mode { PutMode::Overwrite => builder.set_idempotent(true), @@ -367,7 +379,7 @@ impl GoogleCloudStorageClient { path: &Path, upload_id: &MultipartId, part_idx: usize, - data: Bytes, + data: PutPayload, ) -> Result { let query = &[ ("partNumber", &format!("{}", part_idx + 1)), @@ -403,7 +415,9 @@ impl GoogleCloudStorageClient { .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, "0") .query(&[("uploads", "")]) - .send_retry_with_idempotency(&self.config.retry_config, true) + .retryable(&self.config.retry_config) + .idempotent(true) + .send() .await .context(PutRequestSnafu { path: path.as_ref(), @@ -472,7 +486,9 @@ impl GoogleCloudStorageClient { .bearer_auth(&credential.bearer) .query(&[("uploadId", upload_id)]) .body(data) - .send_retry_with_idempotency(&self.config.retry_config, true) + .retryable(&self.config.retry_config) + .idempotent(true) + .send() .await .context(CompleteMultipartRequestSnafu)?; @@ -530,8 +546,10 @@ impl GoogleCloudStorageClient { .bearer_auth(&credential.bearer) // Needed if reqwest is compiled with native-tls instead of rustls-tls // See https://github.com/apache/arrow-rs/pull/3921 - .header(header::CONTENT_LENGTH, 0) - .send_retry_with_idempotency(&self.config.retry_config, !if_not_exists) + .header(CONTENT_LENGTH, 0) + .retryable(&self.config.retry_config) + .idempotent(!if_not_exists) + .send() .await .map_err(|err| match err.status() { Some(StatusCode::PRECONDITION_FAILED) => 
crate::Error::AlreadyExists { diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index abb0417..d7fc2ce 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -623,7 +623,9 @@ impl TokenProvider for AuthorizedUserCredentials { ("client_secret", &self.client_secret), ("refresh_token", &self.refresh_token), ]) - .send_retry_with_idempotency(retry, true) + .retryable(retry) + .idempotent(true) + .send() .await .context(TokenRequestSnafu)? .json::() diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 96afa45..149da76 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -42,10 +42,9 @@ use crate::gcp::credential::GCSAuthorizer; use crate::signer::Signer; use crate::{ multipart::PartId, path::Path, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, - ObjectMeta, ObjectStore, PutOptions, PutResult, Result, UploadPart, + ObjectMeta, ObjectStore, PutOptions, PutPayload, PutResult, Result, UploadPart, }; use async_trait::async_trait; -use bytes::Bytes; use client::GoogleCloudStorageClient; use futures::stream::BoxStream; use hyper::Method; @@ -115,14 +114,14 @@ struct UploadState { #[async_trait] impl MultipartUpload for GCSMultipartUpload { - fn put_part(&mut self, data: Bytes) -> UploadPart { + fn put_part(&mut self, payload: PutPayload) -> UploadPart { let idx = self.part_idx; self.part_idx += 1; let state = Arc::clone(&self.state); Box::pin(async move { let part = state .client - .put_part(&state.path, &state.multipart_id, idx, data) + .put_part(&state.path, &state.multipart_id, idx, payload) .await?; state.parts.put(idx, part); Ok(()) @@ -148,8 +147,13 @@ impl MultipartUpload for GCSMultipartUpload { #[async_trait] impl ObjectStore for GoogleCloudStorage { - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { - self.client.put(location, bytes, opts).await + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + self.client.put(location, payload, opts).await } async fn put_multipart(&self, location: &Path) -> Result> { @@ -210,9 +214,9 @@ impl MultipartStore for GoogleCloudStorage { path: &Path, id: &MultipartId, part_idx: usize, - data: Bytes, + payload: PutPayload, ) -> Result { - self.client.put_part(path, id, part_idx, data).await + self.client.put_part(path, id, part_idx, payload).await } async fn complete_multipart( @@ -260,7 +264,6 @@ impl Signer for GoogleCloudStorage { #[cfg(test)] mod test { - use bytes::Bytes; use credential::DEFAULT_GCS_BASE_URL; use crate::tests::*; @@ -391,7 +394,7 @@ mod test { let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); - let data = Bytes::from("arbitrary data"); + let data = PutPayload::from("arbitrary data"); let err = integration .put(&location, data) diff --git a/src/http/client.rs b/src/http/client.rs index fdc8751..39f68ec 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -21,10 +21,11 @@ use crate::client::retry::{self, RetryConfig, RetryExt}; use crate::client::GetOptionsExt; use crate::path::{Path, DELIMITER}; use crate::util::deserialize_rfc1123; -use crate::{ClientOptions, GetOptions, ObjectMeta, Result}; +use crate::{ClientOptions, GetOptions, ObjectMeta, PutPayload, Result}; use async_trait::async_trait; -use bytes::{Buf, Bytes}; +use bytes::Buf; use chrono::{DateTime, Utc}; +use hyper::header::CONTENT_LENGTH; use percent_encoding::percent_decode_str; use reqwest::header::CONTENT_TYPE; use reqwest::{Method, Response, StatusCode}; @@ 
-156,16 +157,24 @@ impl Client { Ok(()) } - pub async fn put(&self, location: &Path, bytes: Bytes) -> Result { + pub async fn put(&self, location: &Path, payload: PutPayload) -> Result { let mut retry = false; loop { let url = self.path_url(location); - let mut builder = self.client.put(url).body(bytes.clone()); + let mut builder = self.client.put(url); if let Some(value) = self.client_options.get_content_type(location) { builder = builder.header(CONTENT_TYPE, value); } - match builder.send_retry(&self.retry_config).await { + let resp = builder + .header(CONTENT_LENGTH, payload.content_length()) + .retryable(&self.retry_config) + .idempotent(true) + .payload(Some(payload.clone())) + .send() + .await; + + match resp { Ok(response) => return Ok(response), Err(source) => match source.status() { // Some implementations return 404 instead of 409 @@ -189,7 +198,9 @@ impl Client { .client .request(method, url) .header("Depth", depth) - .send_retry_with_idempotency(&self.retry_config, true) + .retryable(&self.retry_config) + .idempotent(true) + .send() .await; let response = match result { diff --git a/src/http/mod.rs b/src/http/mod.rs index 626337d..a838a0f 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -32,7 +32,6 @@ //! [WebDAV]: https://en.wikipedia.org/wiki/WebDAV use async_trait::async_trait; -use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; @@ -45,7 +44,7 @@ use crate::http::client::Client; use crate::path::Path; use crate::{ ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, - ObjectStore, PutMode, PutOptions, PutResult, Result, RetryConfig, + ObjectStore, PutMode, PutOptions, PutPayload, PutResult, Result, RetryConfig, }; mod client; @@ -95,13 +94,18 @@ impl std::fmt::Display for HttpStore { #[async_trait] impl ObjectStore for HttpStore { - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { if opts.mode != PutMode::Overwrite { // TODO: Add support for If header - https://datatracker.ietf.org/doc/html/rfc2518#section-9.4 return Err(crate::Error::NotImplemented); } - let response = self.client.put(location, bytes).await?; + let response = self.client.put(location, payload).await?; let e_tag = match get_etag(response.headers()) { Ok(e_tag) => Some(e_tag), Err(crate::client::header::Error::MissingEtag) => None, diff --git a/src/lib.rs b/src/lib.rs index 97604a7..157852f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -245,15 +245,14 @@ //! # } //! ``` //! -//! # Put Object +//! # Put Object //! //! Use the [`ObjectStore::put`] method to atomically write data. //! //! ``` //! # use object_store::local::LocalFileSystem; -//! # use object_store::ObjectStore; +//! # use object_store::{ObjectStore, PutPayload}; //! # use std::sync::Arc; -//! # use bytes::Bytes; //! # use object_store::path::Path; //! # fn get_object_store() -> Arc { //! # Arc::new(LocalFileSystem::new()) @@ -262,12 +261,12 @@ //! # //! let object_store: Arc = get_object_store(); //! let path = Path::from("data/file1"); -//! let bytes = Bytes::from_static(b"hello"); -//! object_store.put(&path, bytes).await.unwrap(); +//! let payload = PutPayload::from_static(b"hello"); +//! object_store.put(&path, payload).await.unwrap(); //! # } //! ``` //! -//! # Multipart Upload +//! # Multipart Upload //! //! 
Use the [`ObjectStore::put_multipart`] method to atomically write a large amount of data //! @@ -320,6 +319,48 @@ //! # } //! ``` //! +//! # Vectored Write +//! +//! When writing data it is often the case that the size of the output is not known ahead of time. +//! +//! A common approach to handling this is to bump-allocate a `Vec`, whereby the underlying +//! allocation is repeatedly reallocated, each time doubling the capacity. The performance of +//! this is suboptimal as reallocating memory will often involve copying it to a new location. +//! +//! Fortunately, as [`PutPayload`] does not require memory regions to be contiguous, it is +//! possible to instead allocate memory in chunks and avoid bump allocating. [`PutPayloadMut`] +//! encapsulates this approach +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # use object_store::{ObjectStore, PutPayloadMut}; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) +//! # } +//! # async fn multi_upload() { +//! # +//! let object_store: Arc = get_object_store(); +//! let path = Path::from("data/large_file"); +//! let mut buffer = PutPayloadMut::new().with_block_size(8192); +//! for _ in 0..22 { +//! buffer.extend_from_slice(&[0; 1024]); +//! } +//! let payload = buffer.freeze(); +//! +//! // Payload consists of 3 separate 8KB allocations +//! assert_eq!(payload.as_ref().len(), 3); +//! assert_eq!(payload.as_ref()[0].len(), 8192); +//! assert_eq!(payload.as_ref()[1].len(), 8192); +//! assert_eq!(payload.as_ref()[2].len(), 6144); +//! +//! object_store.put(&path, payload).await.unwrap(); +//! # } +//! ``` +//! //! # Conditional Fetch //! //! More complex object retrieval can be supported by [`ObjectStore::get_opts`]. @@ -427,7 +468,7 @@ //! let new = do_update(r.bytes().await.unwrap()); //! //! // Attempt to commit transaction -//! match store.put_opts(&path, new, PutMode::Update(version).into()).await { +//! match store.put_opts(&path, new.into(), PutMode::Update(version).into()).await { //! Ok(_) => break, // Successfully committed //! Err(Error::Precondition { .. }) => continue, // Object has changed, try again //! Err(e) => panic!("{e}") @@ -498,17 +539,18 @@ pub use tags::TagSet; pub mod multipart; mod parse; +mod payload; mod upload; mod util; pub use parse::{parse_url, parse_url_opts}; +pub use payload::*; pub use upload::*; -pub use util::GetRange; +pub use util::{coalesce_ranges, collect_bytes, GetRange, OBJECT_STORE_COALESCE_DEFAULT}; use crate::path::Path; #[cfg(not(target_arch = "wasm32"))] use crate::util::maybe_spawn_blocking; -pub use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; @@ -532,14 +574,20 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Save the provided bytes to the specified location /// /// The operation is guaranteed to be atomic, it will either successfully - /// write the entirety of `bytes` to `location`, or fail. No clients + /// write the entirety of `payload` to `location`, or fail. 
No clients /// should be able to observe a partially written object - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - self.put_opts(location, bytes, PutOptions::default()).await + async fn put(&self, location: &Path, payload: PutPayload) -> Result { + self.put_opts(location, payload, PutOptions::default()) + .await } - /// Save the provided bytes to the specified location with the given options - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result; + /// Save the provided `payload` to `location` with the given options + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result; /// Perform a multipart upload /// @@ -616,11 +664,10 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// # use object_store::{ObjectStore, ObjectMeta}; /// # use object_store::path::Path; /// # use futures::{StreamExt, TryStreamExt}; - /// # use bytes::Bytes; /// # /// // Create two objects - /// store.put(&Path::from("foo"), Bytes::from("foo")).await?; - /// store.put(&Path::from("bar"), Bytes::from("bar")).await?; + /// store.put(&Path::from("foo"), "foo".into()).await?; + /// store.put(&Path::from("bar"), "bar".into()).await?; /// /// // List object /// let locations = store.list(None).map_ok(|m| m.location).boxed(); @@ -717,17 +764,17 @@ macro_rules! as_ref_impl { ($type:ty) => { #[async_trait] impl ObjectStore for $type { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - self.as_ref().put(location, bytes).await + async fn put(&self, location: &Path, payload: PutPayload) -> Result { + self.as_ref().put(location, payload).await } async fn put_opts( &self, location: &Path, - bytes: Bytes, + payload: PutPayload, opts: PutOptions, ) -> Result { - self.as_ref().put_opts(location, bytes, opts).await + self.as_ref().put_opts(location, payload, opts).await } async fn put_multipart(&self, location: &Path) -> Result> { @@ -1219,8 +1266,7 @@ mod tests { let location = Path::from("test_dir/test_file.json"); let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - storage.put(&location, data).await.unwrap(); + storage.put(&location, data.clone().into()).await.unwrap(); let root = Path::from("/"); @@ -1263,14 +1309,14 @@ mod tests { assert!(content_list.is_empty()); let read_data = storage.get(&location).await.unwrap().bytes().await.unwrap(); - assert_eq!(&*read_data, expected_data); + assert_eq!(&*read_data, data); // Test range request let range = 3..7; let range_result = storage.get_range(&location, range.clone()).await; let bytes = range_result.unwrap(); - assert_eq!(bytes, expected_data.slice(range.clone())); + assert_eq!(bytes, data.slice(range.clone())); let opts = GetOptions { range: Some(GetRange::Bounded(2..5)), @@ -1348,11 +1394,11 @@ mod tests { let ranges = vec![0..1, 2..3, 0..5]; let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); for (range, bytes) in ranges.iter().zip(bytes) { - assert_eq!(bytes, expected_data.slice(range.clone())) + assert_eq!(bytes, data.slice(range.clone())) } let head = storage.head(&location).await.unwrap(); - assert_eq!(head.size, expected_data.len()); + assert_eq!(head.size, data.len()); storage.delete(&location).await.unwrap(); @@ -1369,7 +1415,7 @@ mod tests { let file_with_delimiter = Path::from_iter(["a", "b/c", "foo.file"]); storage - .put(&file_with_delimiter, Bytes::from("arbitrary")) + .put(&file_with_delimiter, "arbitrary".into()) .await .unwrap(); @@ -1409,10 +1455,7 @@ mod tests { let emoji_prefix 
= Path::from("🙀"); let emoji_file = Path::from("🙀/😀.parquet"); - storage - .put(&emoji_file, Bytes::from("arbitrary")) - .await - .unwrap(); + storage.put(&emoji_file, "arbitrary".into()).await.unwrap(); storage.head(&emoji_file).await.unwrap(); storage @@ -1464,7 +1507,7 @@ mod tests { let hello_prefix = Path::parse("%48%45%4C%4C%4F").unwrap(); let path = hello_prefix.child("foo.parquet"); - storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + storage.put(&path, vec![0, 1].into()).await.unwrap(); let files = flatten_list_stream(storage, Some(&hello_prefix)) .await .unwrap(); @@ -1504,7 +1547,7 @@ mod tests { // Can also write non-percent encoded sequences let path = Path::parse("%Q.parquet").unwrap(); - storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + storage.put(&path, vec![0, 1].into()).await.unwrap(); let files = flatten_list_stream(storage, None).await.unwrap(); assert_eq!(files, vec![path.clone()]); @@ -1512,7 +1555,7 @@ mod tests { storage.delete(&path).await.unwrap(); let path = Path::parse("foo bar/I contain spaces.parquet").unwrap(); - storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + storage.put(&path, vec![0, 1].into()).await.unwrap(); storage.head(&path).await.unwrap(); let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) @@ -1622,7 +1665,7 @@ mod tests { delete_fixtures(storage).await; let path = Path::from("empty"); - storage.put(&path, Bytes::new()).await.unwrap(); + storage.put(&path, PutPayload::default()).await.unwrap(); let meta = storage.head(&path).await.unwrap(); assert_eq!(meta.size, 0); let data = storage.get(&path).await.unwrap().bytes().await.unwrap(); @@ -1879,7 +1922,7 @@ mod tests { let data = get_chunks(5 * 1024 * 1024, 3); let bytes_expected = data.concat(); let mut upload = storage.put_multipart(&location).await.unwrap(); - let uploads = data.into_iter().map(|x| upload.put_part(x)); + let uploads = data.into_iter().map(|x| upload.put_part(x.into())); futures::future::try_join_all(uploads).await.unwrap(); // Object should not yet exist in store @@ -1928,7 +1971,7 @@ mod tests { // We can abort an in-progress write let mut upload = storage.put_multipart(&location).await.unwrap(); upload - .put_part(data.first().unwrap().clone()) + .put_part(data.first().unwrap().clone().into()) .await .unwrap(); @@ -1953,7 +1996,7 @@ mod tests { let location1 = Path::from("foo/x.json"); let location2 = Path::from("foo.bar/y.json"); - let data = Bytes::from("arbitrary data"); + let data = PutPayload::from("arbitrary data"); storage.put(&location1, data.clone()).await.unwrap(); storage.put(&location2, data).await.unwrap(); @@ -2011,8 +2054,7 @@ mod tests { .collect(); for f in &files { - let data = data.clone(); - storage.put(f, data).await.unwrap(); + storage.put(f, data.clone().into()).await.unwrap(); } // ==================== check: prefix-list `mydb/wb` (directory) ==================== @@ -2076,15 +2118,15 @@ mod tests { let contents2 = Bytes::from("dogs"); // copy() make both objects identical - storage.put(&path1, contents1.clone()).await.unwrap(); - storage.put(&path2, contents2.clone()).await.unwrap(); + storage.put(&path1, contents1.clone().into()).await.unwrap(); + storage.put(&path2, contents2.clone().into()).await.unwrap(); storage.copy(&path1, &path2).await.unwrap(); let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); // rename() copies contents and deletes original - storage.put(&path1, contents1.clone()).await.unwrap(); - storage.put(&path2, 
contents2.clone()).await.unwrap(); + storage.put(&path1, contents1.clone().into()).await.unwrap(); + storage.put(&path2, contents2.clone().into()).await.unwrap(); storage.rename(&path1, &path2).await.unwrap(); let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); @@ -2104,8 +2146,8 @@ mod tests { let contents2 = Bytes::from("dogs"); // copy_if_not_exists() errors if destination already exists - storage.put(&path1, contents1.clone()).await.unwrap(); - storage.put(&path2, contents2.clone()).await.unwrap(); + storage.put(&path1, contents1.clone().into()).await.unwrap(); + storage.put(&path2, contents2.clone().into()).await.unwrap(); let result = storage.copy_if_not_exists(&path1, &path2).await; assert!(result.is_err()); assert!(matches!( @@ -2133,7 +2175,7 @@ mod tests { // Create destination object let path2 = Path::from("test2"); - storage.put(&path2, Bytes::from("hello")).await.unwrap(); + storage.put(&path2, "hello".into()).await.unwrap(); // copy() errors if source does not exist let result = storage.copy(&path1, &path2).await; @@ -2164,7 +2206,7 @@ mod tests { let parts: Vec<_> = futures::stream::iter(chunks) .enumerate() - .map(|(idx, b)| multipart.put_part(&path, &id, idx, b)) + .map(|(idx, b)| multipart.put_part(&path, &id, idx, b.into())) .buffered(2) .try_collect() .await @@ -2204,7 +2246,7 @@ mod tests { let data = Bytes::from("hello world"); let path = Path::from("file.txt"); - integration.put(&path, data.clone()).await.unwrap(); + integration.put(&path, data.clone().into()).await.unwrap(); let signed = integration .signed_url(Method::GET, &path, Duration::from_secs(60)) diff --git a/src/limit.rs b/src/limit.rs index e5f6841..b94aa05 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -19,7 +19,7 @@ use crate::{ BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, - ObjectStore, Path, PutOptions, PutResult, Result, StreamExt, UploadPart, + ObjectStore, Path, PutOptions, PutPayload, PutResult, Result, StreamExt, UploadPart, }; use async_trait::async_trait; use bytes::Bytes; @@ -70,14 +70,19 @@ impl std::fmt::Display for LimitStore { #[async_trait] impl ObjectStore for LimitStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { + async fn put(&self, location: &Path, payload: PutPayload) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); - self.inner.put(location, bytes).await + self.inner.put(location, payload).await } - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); - self.inner.put_opts(location, bytes, opts).await + self.inner.put_opts(location, payload, opts).await } async fn put_multipart(&self, location: &Path) -> Result> { let upload = self.inner.put_multipart(location).await?; @@ -232,7 +237,7 @@ impl LimitUpload { #[async_trait] impl MultipartUpload for LimitUpload { - fn put_part(&mut self, data: Bytes) -> UploadPart { + fn put_part(&mut self, data: PutPayload) -> UploadPart { let upload = self.upload.put_part(data); let s = Arc::clone(&self.semaphore); Box::pin(async move { diff --git a/src/local.rs b/src/local.rs index 6cc0c67..0d7c279 100644 --- a/src/local.rs +++ b/src/local.rs @@ -39,7 +39,7 @@ use crate::{ path::{absolute_path_to_url, Path}, util::InvalidGetRange, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, 
ObjectMeta, ObjectStore, - PutMode, PutOptions, PutResult, Result, UploadPart, + PutMode, PutOptions, PutPayload, PutResult, Result, UploadPart, }; /// A specialized `Error` for filesystem object store-related errors @@ -336,7 +336,12 @@ fn is_valid_file_path(path: &Path) -> bool { #[async_trait] impl ObjectStore for LocalFileSystem { - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { if matches!(opts.mode, PutMode::Update(_)) { return Err(crate::Error::NotImplemented); } @@ -346,7 +351,7 @@ impl ObjectStore for LocalFileSystem { let (mut file, staging_path) = new_staged_upload(&path)?; let mut e_tag = None; - let err = match file.write_all(&bytes) { + let err = match payload.iter().try_for_each(|x| file.write_all(x)) { Ok(_) => { let metadata = file.metadata().map_err(|e| Error::Metadata { source: e.into(), @@ -724,9 +729,9 @@ impl LocalUpload { #[async_trait] impl MultipartUpload for LocalUpload { - fn put_part(&mut self, data: Bytes) -> UploadPart { + fn put_part(&mut self, data: PutPayload) -> UploadPart { let offset = self.offset; - self.offset += data.len() as u64; + self.offset += data.content_length() as u64; let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { @@ -734,7 +739,11 @@ impl MultipartUpload for LocalUpload { let file = f.as_mut().context(AbortedSnafu)?; file.seek(SeekFrom::Start(offset)) .context(SeekSnafu { path: &s.dest })?; - file.write_all(&data).context(UnableToCopyDataToFileSnafu)?; + + data.iter() + .try_for_each(|x| file.write_all(x)) + .context(UnableToCopyDataToFileSnafu)?; + Ok(()) }) .boxed() @@ -1016,8 +1025,8 @@ mod tests { // Can't use stream_get test as WriteMultipart uses a tokio JoinSet let p = Path::from("manual_upload"); let mut upload = integration.put_multipart(&p).await.unwrap(); - upload.put_part(Bytes::from_static(b"123")).await.unwrap(); - upload.put_part(Bytes::from_static(b"45678")).await.unwrap(); + upload.put_part("123".into()).await.unwrap(); + upload.put_part("45678".into()).await.unwrap(); let r = upload.complete().await.unwrap(); let get = integration.get(&p).await.unwrap(); @@ -1035,9 +1044,11 @@ mod tests { let location = Path::from("nested/file/test_file"); let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - integration.put(&location, data).await.unwrap(); + integration + .put(&location, data.clone().into()) + .await + .unwrap(); let read_data = integration .get(&location) @@ -1046,7 +1057,7 @@ mod tests { .bytes() .await .unwrap(); - assert_eq!(&*read_data, expected_data); + assert_eq!(&*read_data, data); } #[tokio::test] @@ -1057,9 +1068,11 @@ mod tests { let location = Path::from("some_file"); let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - integration.put(&location, data).await.unwrap(); + integration + .put(&location, data.clone().into()) + .await + .unwrap(); let read_data = integration .get(&location) @@ -1068,7 +1081,7 @@ mod tests { .bytes() .await .unwrap(); - assert_eq!(&*read_data, expected_data); + assert_eq!(&*read_data, data); } #[tokio::test] @@ -1260,7 +1273,7 @@ mod tests { // Adding a file through a symlink creates in both paths integration - .put(&Path::from("b/file.parquet"), Bytes::from(vec![0, 1, 2])) + .put(&Path::from("b/file.parquet"), vec![0, 1, 2].into()) .await .unwrap(); @@ -1279,7 +1292,7 @@ mod tests { let directory = Path::from("directory"); let object = directory.child("child.txt"); let 
data = Bytes::from("arbitrary"); - integration.put(&object, data.clone()).await.unwrap(); + integration.put(&object, data.clone().into()).await.unwrap(); integration.head(&object).await.unwrap(); let result = integration.get(&object).await.unwrap(); assert_eq!(result.bytes().await.unwrap(), data); @@ -1319,7 +1332,7 @@ mod tests { let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); let location = Path::from("some_file"); - let data = Bytes::from("arbitrary data"); + let data = PutPayload::from("arbitrary data"); let mut u1 = integration.put_multipart(&location).await.unwrap(); u1.put_part(data.clone()).await.unwrap(); @@ -1418,12 +1431,10 @@ mod tests { #[cfg(test)] mod not_wasm_tests { use std::time::Duration; - - use bytes::Bytes; use tempfile::TempDir; use crate::local::LocalFileSystem; - use crate::{ObjectStore, Path}; + use crate::{ObjectStore, Path, PutPayload}; #[tokio::test] async fn test_cleanup_intermediate_files() { @@ -1431,7 +1442,7 @@ mod not_wasm_tests { let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); let location = Path::from("some_file"); - let data = Bytes::from_static(b"hello"); + let data = PutPayload::from_static(b"hello"); let mut upload = integration.put_multipart(&location).await.unwrap(); upload.put_part(data).await.unwrap(); diff --git a/src/memory.rs b/src/memory.rs index 6c960d4..d42e6f2 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -29,11 +29,11 @@ use snafu::{OptionExt, ResultExt, Snafu}; use crate::multipart::{MultipartStore, PartId}; use crate::util::InvalidGetRange; -use crate::GetOptions; use crate::{ path::Path, GetRange, GetResult, GetResultPayload, ListResult, MultipartId, MultipartUpload, ObjectMeta, ObjectStore, PutMode, PutOptions, PutResult, Result, UpdateVersion, UploadPart, }; +use crate::{GetOptions, PutPayload}; /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] @@ -192,10 +192,15 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { let mut storage = self.storage.write(); let etag = storage.next_etag; - let entry = Entry::new(bytes, Utc::now(), etag); + let entry = Entry::new(payload.into(), Utc::now(), etag); match opts.mode { PutMode::Overwrite => storage.overwrite(location, entry), @@ -391,14 +396,14 @@ impl MultipartStore for InMemory { _path: &Path, id: &MultipartId, part_idx: usize, - data: Bytes, + payload: PutPayload, ) -> Result { let mut storage = self.storage.write(); let upload = storage.upload_mut(id)?; if part_idx <= upload.parts.len() { upload.parts.resize(part_idx + 1, None); } - upload.parts[part_idx] = Some(data); + upload.parts[part_idx] = Some(payload.into()); Ok(PartId { content_id: Default::default(), }) @@ -471,21 +476,22 @@ impl InMemory { #[derive(Debug)] struct InMemoryUpload { location: Path, - parts: Vec, + parts: Vec, storage: Arc>, } #[async_trait] impl MultipartUpload for InMemoryUpload { - fn put_part(&mut self, data: Bytes) -> UploadPart { - self.parts.push(data); + fn put_part(&mut self, payload: PutPayload) -> UploadPart { + self.parts.push(payload); Box::pin(futures::future::ready(Ok(()))) } async fn complete(&mut self) -> Result { - let cap = self.parts.iter().map(|x| x.len()).sum(); + let cap = self.parts.iter().map(|x| x.content_length()).sum(); let mut buf = Vec::with_capacity(cap); - 
self.parts.iter().for_each(|x| buf.extend_from_slice(x)); + let parts = self.parts.iter().flatten(); + parts.for_each(|x| buf.extend_from_slice(x)); let etag = self.storage.write().insert(&self.location, buf.into()); Ok(PutResult { e_tag: Some(etag.to_string()), @@ -552,9 +558,11 @@ mod tests { let location = Path::from("some_file"); let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - integration.put(&location, data).await.unwrap(); + integration + .put(&location, data.clone().into()) + .await + .unwrap(); let read_data = integration .get(&location) @@ -563,7 +571,7 @@ mod tests { .bytes() .await .unwrap(); - assert_eq!(&*read_data, expected_data); + assert_eq!(&*read_data, data); } const NON_EXISTENT_NAME: &str = "nonexistentname"; diff --git a/src/multipart.rs b/src/multipart.rs index 26cce39..d94e7f1 100644 --- a/src/multipart.rs +++ b/src/multipart.rs @@ -22,10 +22,9 @@ //! especially useful when dealing with large files or high-throughput systems. use async_trait::async_trait; -use bytes::Bytes; use crate::path::Path; -use crate::{MultipartId, PutResult, Result}; +use crate::{MultipartId, PutPayload, PutResult, Result}; /// Represents a part of a file that has been successfully uploaded in a multipart upload process. #[derive(Debug, Clone)] @@ -64,7 +63,7 @@ pub trait MultipartStore: Send + Sync + 'static { path: &Path, id: &MultipartId, part_idx: usize, - data: Bytes, + data: PutPayload, ) -> Result; /// Completes a multipart upload diff --git a/src/payload.rs b/src/payload.rs new file mode 100644 index 0000000..486bea3 --- /dev/null +++ b/src/payload.rs @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
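The new `src/payload.rs` module below defines `PutPayload`, a cheaply cloneable, ordered collection of `Bytes` that replaces `Bytes` throughout the `put` APIs in this patch (lib.rs re-exports it via `pub use payload::*`). A short usage sketch of the public API defined in the hunks that follow; the assertions simply mirror the semantics of that code:

```rust
use bytes::Bytes;
use object_store::PutPayload;

fn main() {
    // Build a payload from several non-contiguous chunks without copying them.
    let payload: PutPayload = [Bytes::from_static(b"hello "), Bytes::from_static(b"world")]
        .into_iter()
        .collect();

    assert_eq!(payload.content_length(), 11); // total length across all chunks
    assert_eq!(payload.as_ref().len(), 2);    // the two Bytes remain separate

    // Cloning only bumps the Arc reference count; no bytes are copied.
    let cheap = payload.clone();

    // Converting to a single contiguous Bytes concatenates only when needed.
    let flat: Bytes = cheap.into();
    assert_eq!(&flat[..], b"hello world");
}
```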
+ +use bytes::Bytes; +use std::sync::Arc; + +/// A cheaply cloneable, ordered collection of [`Bytes`] +#[derive(Debug, Clone)] +pub struct PutPayload(Arc<[Bytes]>); + +impl Default for PutPayload { + fn default() -> Self { + Self(Arc::new([])) + } +} + +impl PutPayload { + /// Create a new empty [`PutPayload`] + pub fn new() -> Self { + Self::default() + } + + /// Creates a [`PutPayload`] from a static slice + pub fn from_static(s: &'static [u8]) -> Self { + s.into() + } + + /// Creates a [`PutPayload`] from a [`Bytes`] + pub fn from_bytes(s: Bytes) -> Self { + s.into() + } + + #[cfg(feature = "cloud")] + pub(crate) fn body(&self) -> reqwest::Body { + reqwest::Body::wrap_stream(futures::stream::iter( + self.clone().into_iter().map(Ok::<_, crate::Error>), + )) + } + + /// Returns the total length of the [`Bytes`] in this payload + pub fn content_length(&self) -> usize { + self.0.iter().map(|b| b.len()).sum() + } + + /// Returns an iterator over the [`Bytes`] in this payload + pub fn iter(&self) -> PutPayloadIter<'_> { + PutPayloadIter(self.0.iter()) + } +} + +impl AsRef<[Bytes]> for PutPayload { + fn as_ref(&self) -> &[Bytes] { + self.0.as_ref() + } +} + +impl<'a> IntoIterator for &'a PutPayload { + type Item = &'a Bytes; + type IntoIter = PutPayloadIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl IntoIterator for PutPayload { + type Item = Bytes; + type IntoIter = PutPayloadIntoIter; + + fn into_iter(self) -> Self::IntoIter { + PutPayloadIntoIter { + payload: self, + idx: 0, + } + } +} + +/// An iterator over [`PutPayload`] +#[derive(Debug)] +pub struct PutPayloadIter<'a>(std::slice::Iter<'a, Bytes>); + +impl<'a> Iterator for PutPayloadIter<'a> { + type Item = &'a Bytes; + + fn next(&mut self) -> Option { + self.0.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +/// An owning iterator of [`PutPayload`] +#[derive(Debug)] +pub struct PutPayloadIntoIter { + payload: PutPayload, + idx: usize, +} + +impl Iterator for PutPayloadIntoIter { + type Item = Bytes; + + fn next(&mut self) -> Option { + let p = self.payload.0.get(self.idx)?.clone(); + self.idx += 1; + Some(p) + } + + fn size_hint(&self) -> (usize, Option) { + let l = self.payload.0.len() - self.idx; + (l, Some(l)) + } +} + +impl From for PutPayload { + fn from(value: Bytes) -> Self { + Self(Arc::new([value])) + } +} + +impl From> for PutPayload { + fn from(value: Vec) -> Self { + Self(Arc::new([value.into()])) + } +} + +impl From<&'static str> for PutPayload { + fn from(value: &'static str) -> Self { + Bytes::from(value).into() + } +} + +impl From<&'static [u8]> for PutPayload { + fn from(value: &'static [u8]) -> Self { + Bytes::from(value).into() + } +} + +impl From for PutPayload { + fn from(value: String) -> Self { + Bytes::from(value).into() + } +} + +impl FromIterator for PutPayload { + fn from_iter>(iter: T) -> Self { + Bytes::from_iter(iter).into() + } +} + +impl FromIterator for PutPayload { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +impl From for Bytes { + fn from(value: PutPayload) -> Self { + match value.0.len() { + 0 => Self::new(), + 1 => value.0[0].clone(), + _ => { + let mut buf = Vec::with_capacity(value.content_length()); + value.iter().for_each(|x| buf.extend_from_slice(x)); + buf.into() + } + } + } +} + +/// A builder for [`PutPayload`] that avoids reallocating memory +/// +/// Data is allocated in fixed blocks, which are flushed to [`Bytes`] once full. 
+/// Unlike [`Vec`] this avoids needing to repeatedly reallocate blocks of memory, +/// which typically involves copying all the previously written data to a new +/// contiguous memory region. +#[derive(Debug)] +pub struct PutPayloadMut { + len: usize, + completed: Vec, + in_progress: Vec, + block_size: usize, +} + +impl Default for PutPayloadMut { + fn default() -> Self { + Self { + len: 0, + completed: vec![], + in_progress: vec![], + + block_size: 8 * 1024, + } + } +} + +impl PutPayloadMut { + /// Create a new [`PutPayloadMut`] + pub fn new() -> Self { + Self::default() + } + + /// Configures the minimum allocation size + /// + /// Defaults to 8KB + pub fn with_block_size(self, block_size: usize) -> Self { + Self { block_size, ..self } + } + + /// Write bytes into this [`PutPayloadMut`] + /// + /// If there is an in-progress block, data will be first written to it, flushing + /// it to [`Bytes`] once full. If data remains to be written, a new block of memory + /// of at least the configured block size will be allocated, to hold the remaining data. + pub fn extend_from_slice(&mut self, slice: &[u8]) { + let remaining = self.in_progress.capacity() - self.in_progress.len(); + let to_copy = remaining.min(slice.len()); + + self.in_progress.extend_from_slice(&slice[..to_copy]); + if self.in_progress.capacity() == self.in_progress.len() { + let new_cap = self.block_size.max(slice.len() - to_copy); + let completed = std::mem::replace(&mut self.in_progress, Vec::with_capacity(new_cap)); + if !completed.is_empty() { + self.completed.push(completed.into()) + } + self.in_progress.extend_from_slice(&slice[to_copy..]) + } + self.len += slice.len(); + } + + /// Append a [`Bytes`] to this [`PutPayloadMut`] without copying + /// + /// This will close any currently buffered block populated by [`Self::extend_from_slice`], + /// and append `bytes` to this payload without copying. 
+ pub fn push(&mut self, bytes: Bytes) { + if !self.in_progress.is_empty() { + let completed = std::mem::take(&mut self.in_progress); + self.completed.push(completed.into()) + } + self.completed.push(bytes) + } + + /// Returns `true` if this [`PutPayloadMut`] contains no bytes + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the total length of the [`Bytes`] in this payload + #[inline] + pub fn content_length(&self) -> usize { + self.len + } + + /// Convert into [`PutPayload`] + pub fn freeze(mut self) -> PutPayload { + if !self.in_progress.is_empty() { + let completed = std::mem::take(&mut self.in_progress).into(); + self.completed.push(completed); + } + PutPayload(self.completed.into()) + } +} + +impl From for PutPayload { + fn from(value: PutPayloadMut) -> Self { + value.freeze() + } +} + +#[cfg(test)] +mod test { + use crate::PutPayloadMut; + + #[test] + fn test_put_payload() { + let mut chunk = PutPayloadMut::new().with_block_size(23); + chunk.extend_from_slice(&[1; 16]); + chunk.extend_from_slice(&[2; 32]); + chunk.extend_from_slice(&[2; 5]); + chunk.extend_from_slice(&[2; 21]); + chunk.extend_from_slice(&[2; 40]); + chunk.extend_from_slice(&[0; 0]); + chunk.push("foobar".into()); + + let payload = chunk.freeze(); + assert_eq!(payload.content_length(), 120); + + let chunks = payload.as_ref(); + assert_eq!(chunks.len(), 6); + + assert_eq!(chunks[0].len(), 23); + assert_eq!(chunks[1].len(), 25); // 32 - (23 - 16) + assert_eq!(chunks[2].len(), 23); + assert_eq!(chunks[3].len(), 23); + assert_eq!(chunks[4].len(), 20); + assert_eq!(chunks[5].len(), 6); + } +} diff --git a/src/prefix.rs b/src/prefix.rs index 053f71a..1d1ffee 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -23,7 +23,7 @@ use std::ops::Range; use crate::path::Path; use crate::{ GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutOptions, - PutResult, Result, + PutPayload, PutResult, Result, }; #[doc(hidden)] @@ -80,14 +80,19 @@ impl PrefixStore { #[async_trait::async_trait] impl ObjectStore for PrefixStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { + async fn put(&self, location: &Path, payload: PutPayload) -> Result { let full_path = self.full_path(location); - self.inner.put(&full_path, bytes).await + self.inner.put(&full_path, payload).await } - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { let full_path = self.full_path(location); - self.inner.put_opts(&full_path, bytes, opts).await + self.inner.put_opts(&full_path, payload, opts).await } async fn put_multipart(&self, location: &Path) -> Result> { @@ -218,9 +223,8 @@ mod tests { let location = Path::from("prefix/test_file.json"); let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - local.put(&location, data).await.unwrap(); + local.put(&location, data.clone().into()).await.unwrap(); let prefix = PrefixStore::new(local, "prefix"); let location_prefix = Path::from("test_file.json"); @@ -239,11 +243,11 @@ mod tests { .bytes() .await .unwrap(); - assert_eq!(&*read_data, expected_data); + assert_eq!(&*read_data, data); let target_prefix = Path::from("/test_written.json"); prefix - .put(&target_prefix, expected_data.clone()) + .put(&target_prefix, data.clone().into()) .await .unwrap(); @@ -256,6 +260,6 @@ mod tests { let location = Path::from("prefix/test_written.json"); let read_data = 
local.get(&location).await.unwrap().bytes().await.unwrap(); - assert_eq!(&*read_data, expected_data) + assert_eq!(&*read_data, data) } } diff --git a/src/throttle.rs b/src/throttle.rs index 65fac59..d089784 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -23,7 +23,7 @@ use std::{convert::TryInto, sync::Arc}; use crate::multipart::{MultipartStore, PartId}; use crate::{ path::Path, GetResult, GetResultPayload, ListResult, MultipartId, MultipartUpload, ObjectMeta, - ObjectStore, PutOptions, PutResult, Result, + ObjectStore, PutOptions, PutPayload, PutResult, Result, }; use crate::{GetOptions, UploadPart}; use async_trait::async_trait; @@ -148,14 +148,19 @@ impl std::fmt::Display for ThrottledStore { #[async_trait] impl ObjectStore for ThrottledStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { + async fn put(&self, location: &Path, payload: PutPayload) -> Result { sleep(self.config().wait_put_per_call).await; - self.inner.put(location, bytes).await + self.inner.put(location, payload).await } - async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { sleep(self.config().wait_put_per_call).await; - self.inner.put_opts(location, bytes, opts).await + self.inner.put_opts(location, payload, opts).await } async fn put_multipart(&self, location: &Path) -> Result> { @@ -332,7 +337,7 @@ impl MultipartStore for ThrottledStore { path: &Path, id: &MultipartId, part_idx: usize, - data: Bytes, + data: PutPayload, ) -> Result { sleep(self.config().wait_put_per_call).await; self.inner.put_part(path, id, part_idx, data).await @@ -360,7 +365,7 @@ struct ThrottledUpload { #[async_trait] impl MultipartUpload for ThrottledUpload { - fn put_part(&mut self, data: Bytes) -> UploadPart { + fn put_part(&mut self, data: PutPayload) -> UploadPart { let duration = self.sleep; let put = self.upload.put_part(data); Box::pin(async move { @@ -382,7 +387,6 @@ impl MultipartUpload for ThrottledUpload { mod tests { use super::*; use crate::{memory::InMemory, tests::*, GetResultPayload}; - use bytes::Bytes; use futures::TryStreamExt; use tokio::time::Duration; use tokio::time::Instant; @@ -536,8 +540,7 @@ mod tests { if let Some(n_bytes) = n_bytes { let data: Vec<_> = std::iter::repeat(1u8).take(n_bytes).collect(); - let bytes = Bytes::from(data); - store.put(&path, bytes).await.unwrap(); + store.put(&path, data.into()).await.unwrap(); } else { // ensure object is absent store.delete(&path).await.unwrap(); @@ -560,9 +563,7 @@ mod tests { // create new entries for i in 0..n_entries { let path = prefix.child(i.to_string().as_str()); - - let data = Bytes::from("bar"); - store.put(&path, data).await.unwrap(); + store.put(&path, "bar".into()).await.unwrap(); } prefix @@ -630,10 +631,9 @@ mod tests { async fn measure_put(store: &ThrottledStore, n_bytes: usize) -> Duration { let data: Vec<_> = std::iter::repeat(1u8).take(n_bytes).collect(); - let bytes = Bytes::from(data); let t0 = Instant::now(); - store.put(&Path::from("foo"), bytes).await.unwrap(); + store.put(&Path::from("foo"), data.into()).await.unwrap(); t0.elapsed() } diff --git a/src/upload.rs b/src/upload.rs index fe864e2..9805df0 100644 --- a/src/upload.rs +++ b/src/upload.rs @@ -17,14 +17,13 @@ use std::task::{Context, Poll}; +use crate::{PutPayload, PutPayloadMut, PutResult, Result}; use async_trait::async_trait; use bytes::Bytes; use futures::future::BoxFuture; use futures::ready; use tokio::task::JoinSet; -use 
crate::{PutResult, Result}; - /// An upload part request pub type UploadPart = BoxFuture<'static, Result<()>>; @@ -65,7 +64,7 @@ pub trait MultipartUpload: Send + std::fmt::Debug { /// ``` /// /// [R2]: https://developers.cloudflare.com/r2/objects/multipart-objects/#limitations - fn put_part(&mut self, data: Bytes) -> UploadPart; + fn put_part(&mut self, data: PutPayload) -> UploadPart; /// Complete the multipart upload /// @@ -106,7 +105,9 @@ pub trait MultipartUpload: Send + std::fmt::Debug { pub struct WriteMultipart { upload: Box, - buffer: Vec, + buffer: PutPayloadMut, + + chunk_size: usize, tasks: JoinSet>, } @@ -121,7 +122,8 @@ impl WriteMultipart { pub fn new_with_chunk_size(upload: Box, chunk_size: usize) -> Self { Self { upload, - buffer: Vec::with_capacity(chunk_size), + chunk_size, + buffer: PutPayloadMut::new(), tasks: Default::default(), } } @@ -149,6 +151,9 @@ impl WriteMultipart { /// Write data to this [`WriteMultipart`] /// + /// Data is buffered using [`PutPayloadMut::extend_from_slice`]. Implementations looking to + /// write data from owned buffers may prefer [`Self::put`] as this avoids copying. + /// /// Note this method is synchronous (not `async`) and will immediately /// start new uploads as soon as the internal `chunk_size` is hit, /// regardless of how many outstanding uploads are already in progress. @@ -157,19 +162,38 @@ impl WriteMultipart { /// [`Self::wait_for_capacity`] prior to calling this method pub fn write(&mut self, mut buf: &[u8]) { while !buf.is_empty() { - let capacity = self.buffer.capacity(); - let remaining = capacity - self.buffer.len(); + let remaining = self.chunk_size - self.buffer.content_length(); let to_read = buf.len().min(remaining); self.buffer.extend_from_slice(&buf[..to_read]); if to_read == remaining { - let part = std::mem::replace(&mut self.buffer, Vec::with_capacity(capacity)); - self.put_part(part.into()) + let buffer = std::mem::take(&mut self.buffer); + self.put_part(buffer.into()) } buf = &buf[to_read..] } } - fn put_part(&mut self, part: Bytes) { + /// Put a chunk of data into this [`WriteMultipart`] without copying + /// + /// Data is buffered using [`PutPayloadMut::push`]. Implementations looking to + /// perform writes from non-owned buffers should prefer [`Self::write`] as this + /// will allow multiple calls to share the same underlying allocation. 
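To make the buffering behaviour above concrete, a small usage sketch (not from the patch): `WriteMultipart` accumulates data in a `PutPayloadMut` and spawns a part upload each time the configured chunk size is reached. This assumes an async context and that the `WriteMultipart::finish` method from earlier in this series is available to flush the remaining buffer and await outstanding parts.

use object_store::{memory::InMemory, path::Path, ObjectStore, Result, WriteMultipart};

async fn upload_in_chunks(data: &[u8]) -> Result<()> {
    let store = InMemory::new();
    let path = Path::from("big_file");

    // 5 MiB chunks: each time the internal buffer reaches this size,
    // a part upload is started in the background.
    let upload = store.put_multipart(&path).await?;
    let mut writer = WriteMultipart::new_with_chunk_size(upload, 5 * 1024 * 1024);

    // Copies `data` into the current block-allocated buffer.
    writer.write(data);

    // Owned Bytes can instead be appended without copying.
    writer.put(bytes::Bytes::from_static(b"trailer"));

    // Flush whatever is buffered and wait for all parts to complete.
    writer.finish().await?;
    Ok(())
}
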
+ /// + /// See [`Self::write`] for information on backpressure + pub fn put(&mut self, mut bytes: Bytes) { + while !bytes.is_empty() { + let remaining = self.chunk_size - self.buffer.content_length(); + if bytes.len() < remaining { + self.buffer.push(bytes); + return; + } + self.buffer.push(bytes.split_to(remaining)); + let buffer = std::mem::take(&mut self.buffer); + self.put_part(buffer.into()) + } + } + + pub(crate) fn put_part(&mut self, part: PutPayload) { self.tasks.spawn(self.upload.put_part(part)); } diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index 309a86d..59c5934 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -37,8 +37,13 @@ impl std::fmt::Display for MyStore { #[async_trait] impl ObjectStore for MyStore { - async fn put_opts(&self, path: &Path, data: Bytes, opts: PutOptions) -> Result { - self.0.put_opts(path, data, opts).await + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> Result { + self.0.put_opts(location, payload, opts).await } async fn put_multipart(&self, _location: &Path) -> Result> { @@ -77,7 +82,7 @@ async fn test_get_range() { let path = Path::from("foo"); let expected = Bytes::from_static(b"hello world"); - store.put(&path, expected.clone()).await.unwrap(); + store.put(&path, expected.clone().into()).await.unwrap(); let fetched = store.get(&path).await.unwrap().bytes().await.unwrap(); assert_eq!(expected, fetched); @@ -101,7 +106,7 @@ async fn test_get_opts_over_range() { let path = Path::from("foo"); let expected = Bytes::from_static(b"hello world"); - store.put(&path, expected.clone()).await.unwrap(); + store.put(&path, expected.clone().into()).await.unwrap(); let opts = GetOptions { range: Some(GetRange::Bounded(0..(expected.len() * 2))), From 5f2997c746abf48192f619f69b38c55b56653c22 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 15 Apr 2024 15:17:58 +0100 Subject: [PATCH 293/397] Fix flaky test_cleanup_intermediate_files (#5645) --- src/local.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/local.rs b/src/local.rs index 0d7c279..d5581cd 100644 --- a/src/local.rs +++ b/src/local.rs @@ -1450,10 +1450,14 @@ mod not_wasm_tests { assert_eq!(file_count, 1); drop(upload); - tokio::time::sleep(Duration::from_millis(1)).await; - - let file_count = std::fs::read_dir(root.path()).unwrap().count(); - assert_eq!(file_count, 0); + for _ in 0..100 { + tokio::time::sleep(Duration::from_millis(1)).await; + let file_count = std::fs::read_dir(root.path()).unwrap().count(); + if file_count == 0 { + return; + } + } + panic!("Failed to cleanup file in 100ms") } } From 2d21ecba51b9679828f4609c97a57fa1f4fee1bd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:48:56 +0100 Subject: [PATCH 294/397] Add Attributes API (#5329) (#5650) * Add Attributes API (#5329) * Clippy * Emulator test tweaks --- src/attributes.rs | 211 ++++++++++++++++++++++++++++++++++++++++++++ src/aws/client.rs | 30 +++++-- src/aws/mod.rs | 10 ++- src/azure/client.rs | 43 ++++++--- src/azure/mod.rs | 9 +- src/client/get.rs | 21 ++++- src/client/mod.rs | 2 +- src/gcp/client.rs | 47 ++++++---- src/gcp/mod.rs | 2 + src/http/client.rs | 29 ++++-- src/http/mod.rs | 4 +- src/lib.rs | 36 +++++++- src/local.rs | 9 +- src/memory.rs | 36 +++++--- 14 files changed, 419 insertions(+), 70 deletions(-) create mode 100644 src/attributes.rs diff --git a/src/attributes.rs 
b/src/attributes.rs new file mode 100644 index 0000000..9b90b53 --- /dev/null +++ b/src/attributes.rs @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::borrow::Cow; +use std::collections::HashMap; +use std::ops::Deref; + +/// Additional object attribute types +#[non_exhaustive] +#[derive(Debug, Hash, Eq, PartialEq, Clone)] +pub enum Attribute { + /// Specifies the MIME type of the object + /// + /// This takes precedence over any [ClientOptions](crate::ClientOptions) configuration + /// + /// See [Content-Type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type) + ContentType, + /// Overrides cache control policy of the object + /// + /// See [Cache-Control](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control) + CacheControl, +} + +/// The value of an [`Attribute`] +/// +/// Provides efficient conversion from both static and owned strings +/// +/// ``` +/// # use object_store::AttributeValue; +/// // Can use static strings without needing an allocation +/// let value = AttributeValue::from("bar"); +/// // Can also store owned strings +/// let value = AttributeValue::from("foo".to_string()); +/// ``` +#[derive(Debug, Hash, Eq, PartialEq, Clone)] +pub struct AttributeValue(Cow<'static, str>); + +impl AsRef for AttributeValue { + fn as_ref(&self) -> &str { + &self.0 + } +} + +impl From<&'static str> for AttributeValue { + fn from(value: &'static str) -> Self { + Self(Cow::Borrowed(value)) + } +} + +impl From for AttributeValue { + fn from(value: String) -> Self { + Self(Cow::Owned(value)) + } +} + +impl Deref for AttributeValue { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +/// Additional attributes of an object +/// +/// Attributes can be specified in [PutOptions](crate::PutOptions) and retrieved +/// from APIs returning [GetResult](crate::GetResult). 
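As an illustrative sketch of how these types compose (not part of the patch): keys are the `Attribute` variants above, values are cheap `AttributeValue` wrappers over static or owned strings, and `Attributes` collects the pairs.

use object_store::{Attribute, AttributeValue, Attributes};

fn attributes_demo() {
    let mut attrs = Attributes::new();

    // Static strings are stored borrowed; owned Strings are stored owned.
    attrs.insert(Attribute::ContentType, "text/html; charset=utf-8".into());
    attrs.insert(
        Attribute::CacheControl,
        AttributeValue::from("max-age=604800".to_string()),
    );

    assert_eq!(attrs.len(), 2);
    assert_eq!(
        attrs.get(&Attribute::ContentType).map(|v| v.as_ref()),
        Some("text/html; charset=utf-8")
    );
}
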
+/// +/// Unlike [`ObjectMeta`](crate::ObjectMeta), [`Attributes`] are not returned by +/// listing APIs +#[derive(Debug, Default, Eq, PartialEq, Clone)] +pub struct Attributes(HashMap); + +impl Attributes { + /// Create a new empty [`Attributes`] + pub fn new() -> Self { + Self::default() + } + + /// Create a new [`Attributes`] with space for `capacity` [`Attribute`] + pub fn with_capacity(capacity: usize) -> Self { + Self(HashMap::with_capacity(capacity)) + } + + /// Insert a new [`Attribute`], [`AttributeValue`] pair + /// + /// Returns the previous value for `key` if any + pub fn insert(&mut self, key: Attribute, value: AttributeValue) -> Option { + self.0.insert(key, value) + } + + /// Returns the [`AttributeValue`] for `key` if any + pub fn get(&self, key: &Attribute) -> Option<&AttributeValue> { + self.0.get(key) + } + + /// Removes the [`AttributeValue`] for `key` if any + pub fn remove(&mut self, key: &Attribute) -> Option { + self.0.remove(key) + } + + /// Returns an [`AttributesIter`] over this + pub fn iter(&self) -> AttributesIter<'_> { + self.into_iter() + } + + /// Returns the number of [`Attribute`] in this collection + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// Returns true if this contains no [`Attribute`] + #[inline] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +impl FromIterator<(K, V)> for Attributes +where + K: Into, + V: Into, +{ + fn from_iter>(iter: T) -> Self { + Self( + iter.into_iter() + .map(|(k, v)| (k.into(), v.into())) + .collect(), + ) + } +} + +impl<'a> IntoIterator for &'a Attributes { + type Item = (&'a Attribute, &'a AttributeValue); + type IntoIter = AttributesIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + AttributesIter(self.0.iter()) + } +} + +/// Iterator over [`Attributes`] +#[derive(Debug)] +pub struct AttributesIter<'a>(std::collections::hash_map::Iter<'a, Attribute, AttributeValue>); + +impl<'a> Iterator for AttributesIter<'a> { + type Item = (&'a Attribute, &'a AttributeValue); + + fn next(&mut self) -> Option { + self.0.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_attributes_basic() { + let mut attributes = Attributes::from_iter([ + (Attribute::ContentType, "test"), + (Attribute::CacheControl, "control"), + ]); + + assert!(!attributes.is_empty()); + assert_eq!(attributes.len(), 2); + + assert_eq!( + attributes.get(&Attribute::ContentType), + Some(&"test".into()) + ); + + let metav = "control".into(); + assert_eq!(attributes.get(&Attribute::CacheControl), Some(&metav)); + assert_eq!( + attributes.insert(Attribute::CacheControl, "v1".into()), + Some(metav) + ); + assert_eq!(attributes.len(), 2); + + assert_eq!( + attributes.remove(&Attribute::CacheControl).unwrap(), + "v1".into() + ); + assert_eq!(attributes.len(), 1); + + let metav: AttributeValue = "v2".into(); + attributes.insert(Attribute::CacheControl, metav.clone()); + assert_eq!(attributes.get(&Attribute::CacheControl), Some(&metav)); + assert_eq!(attributes.len(), 2); + } +} diff --git a/src/aws/client.rs b/src/aws/client.rs index c1789ed..e81ef6a 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -35,23 +35,21 @@ use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; use crate::{ - ClientOptions, GetOptions, ListResult, MultipartId, Path, PutPayload, PutResult, Result, - RetryConfig, + Attribute, Attributes, ClientOptions, GetOptions, ListResult, MultipartId, Path, PutPayload, + 
PutResult, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; +use hyper::header::{CACHE_CONTROL, CONTENT_LENGTH}; use hyper::http; use hyper::http::HeaderName; use itertools::Itertools; use md5::{Digest, Md5}; use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; -use reqwest::{ - header::{CONTENT_LENGTH, CONTENT_TYPE}, - Client as ReqwestClient, Method, RequestBuilder, Response, -}; +use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, RequestBuilder, Response}; use ring::digest; use ring::digest::Context; use serde::{Deserialize, Serialize}; @@ -344,6 +342,7 @@ impl S3Client { &'a self, path: &'a Path, payload: PutPayload, + attributes: Attributes, with_encryption_headers: bool, ) -> Request<'a> { let url = self.config.path_url(path); @@ -363,8 +362,21 @@ impl S3Client { ) } - if let Some(value) = self.config.client_options.get_content_type(path) { - builder = builder.header(CONTENT_TYPE, value); + let mut has_content_type = false; + for (k, v) in &attributes { + builder = match k { + Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentType => { + has_content_type = true; + builder.header(CONTENT_TYPE, v.as_ref()) + } + }; + } + + if !has_content_type { + if let Some(value) = self.config.client_options.get_content_type(path) { + builder = builder.header(CONTENT_TYPE, value); + } } Request { @@ -556,7 +568,7 @@ impl S3Client { let part = (part_idx + 1).to_string(); let response = self - .put_request(path, data, false) + .put_request(path, data, Attributes::default(), false) .query(&[("partNumber", &part), ("uploadId", upload_id)]) .idempotent(true) .send() diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 9e741c9..43bd38a 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -156,7 +156,8 @@ impl ObjectStore for AmazonS3 { payload: PutPayload, opts: PutOptions, ) -> Result { - let mut request = self.client.put_request(location, payload, true); + let attrs = opts.attributes; + let mut request = self.client.put_request(location, payload, attrs, true); let tags = opts.tags.encoded(); if !tags.is_empty() && !self.client.config.disable_tagging { request = request.header(&TAGS_HEADER, tags); @@ -403,7 +404,7 @@ mod tests { let test_not_exists = config.copy_if_not_exists.is_some(); let test_conditional_put = config.conditional_put.is_some(); - put_get_delete_list_opts(&integration).await; + put_get_delete_list(&integration).await; get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; @@ -412,6 +413,7 @@ mod tests { multipart(&integration, &integration).await; signing(&integration).await; s3_encryption(&integration).await; + put_get_attributes(&integration).await; // Object tagging is not supported by S3 Express One Zone if config.session_provider.is_none() { @@ -432,12 +434,12 @@ mod tests { // run integration test with unsigned payload enabled let builder = AmazonS3Builder::from_env().with_unsigned_payload(true); let integration = builder.build().unwrap(); - put_get_delete_list_opts(&integration).await; + put_get_delete_list(&integration).await; // run integration test with checksum set to sha256 let builder = AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); let integration = builder.build().unwrap(); - put_get_delete_list_opts(&integration).await; + put_get_delete_list(&integration).await; match 
&integration.client.config.copy_if_not_exists { Some(S3CopyIfNotExists::Dynamo(d)) => dynamo::integration_test(&integration, d).await, diff --git a/src/azure/client.rs b/src/azure/client.rs index d5972d0..134609e 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -27,14 +27,15 @@ use crate::multipart::PartId; use crate::path::DELIMITER; use crate::util::{deserialize_rfc1123, GetRange}; use crate::{ - ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutMode, PutOptions, PutPayload, - PutResult, Result, RetryConfig, + Attribute, Attributes, ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutMode, + PutOptions, PutPayload, PutResult, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; +use hyper::header::CACHE_CONTROL; use hyper::http::HeaderName; use reqwest::header::CONTENT_TYPE; use reqwest::{ @@ -187,9 +188,8 @@ impl<'a> PutRequest<'a> { Self { builder, ..self } } - fn set_idempotent(mut self, idempotent: bool) -> Self { - self.idempotent = idempotent; - self + fn set_idempotent(self, idempotent: bool) -> Self { + Self { idempotent, ..self } } async fn send(self) -> Result { @@ -199,7 +199,7 @@ impl<'a> PutRequest<'a> { .header(CONTENT_LENGTH, self.payload.content_length()) .with_azure_authorization(&credential, &self.config.account) .retryable(&self.config.retry_config) - .idempotent(true) + .idempotent(self.idempotent) .payload(Some(self.payload)) .send() .await @@ -233,13 +233,31 @@ impl AzureClient { self.config.get_credential().await } - fn put_request<'a>(&'a self, path: &'a Path, payload: PutPayload) -> PutRequest<'a> { + fn put_request<'a>( + &'a self, + path: &'a Path, + payload: PutPayload, + attributes: Attributes, + ) -> PutRequest<'a> { let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); - if let Some(value) = self.config().client_options.get_content_type(path) { - builder = builder.header(CONTENT_TYPE, value); + let mut has_content_type = false; + for (k, v) in &attributes { + builder = match k { + Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentType => { + has_content_type = true; + builder.header(CONTENT_TYPE, v.as_ref()) + } + }; + } + + if !has_content_type { + if let Some(value) = self.config.client_options.get_content_type(path) { + builder = builder.header(CONTENT_TYPE, value); + } } PutRequest { @@ -258,7 +276,7 @@ impl AzureClient { payload: PutPayload, opts: PutOptions, ) -> Result { - let builder = self.put_request(path, payload); + let builder = self.put_request(path, payload, opts.attributes); let builder = match &opts.mode { PutMode::Overwrite => builder.set_idempotent(true), @@ -288,7 +306,7 @@ impl AzureClient { let content_id = format!("{part_idx:20}"); let block_id = BASE64_STANDARD.encode(&content_id); - self.put_request(path, payload) + self.put_request(path, payload, Attributes::default()) .query(&[("comp", "block"), ("blockid", &block_id)]) .set_idempotent(true) .send() @@ -304,8 +322,9 @@ impl AzureClient { .map(|part| BlockId::from(part.content_id)) .collect(); + let payload = BlockList { blocks }.to_xml().into(); let response = self - .put_request(path, BlockList { blocks }.to_xml().into()) + .put_request(path, payload, Attributes::default()) .query(&[("comp", "blocklist")]) .set_idempotent(true) .send() diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 8dc5242..3bb57c4 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs 
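For callers, this commit threads attributes from `PutOptions` down into the provider-specific headers shown above, and surfaces them again on `GetResult`. A rough sketch of the round trip (not from the patch; assumes an async context and a store that supports attributes, such as the in-memory store):

use object_store::{
    memory::InMemory, path::Path, Attribute, Attributes, ObjectStore, PutOptions, Result,
};

async fn put_then_get_attributes() -> Result<()> {
    let store = InMemory::new();
    let path = Path::from("attributes");

    let attributes = Attributes::from_iter([
        (Attribute::ContentType, "text/html; charset=utf-8"),
        (Attribute::CacheControl, "max-age=604800"),
    ]);

    let opts = PutOptions {
        attributes: attributes.clone(),
        ..Default::default()
    };
    store.put_opts(&path, "foo".into(), opts).await?;

    // Stores that support attributes return them on GET.
    let result = store.get(&path).await?;
    assert_eq!(result.attributes, attributes);
    Ok(())
}
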
@@ -276,7 +276,7 @@ mod tests { crate::test_util::maybe_skip_integration!(); let integration = MicrosoftAzureBuilder::from_env().build().unwrap(); - put_get_delete_list_opts(&integration).await; + put_get_delete_list(&integration).await; get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; @@ -292,7 +292,12 @@ mod tests { let client = Arc::clone(&integration.client); async move { client.get_blob_tagging(&p).await } }) - .await + .await; + + // Azurite doesn't support attributes properly + if !integration.client.config().is_emulator { + put_get_attributes(&integration).await; + } } #[ignore = "Used for manual testing against a real storage account."] diff --git a/src/client/get.rs b/src/client/get.rs index 2e399e5..f700457 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -19,10 +19,10 @@ use std::ops::Range; use crate::client::header::{header_meta, HeaderConfig}; use crate::path::Path; -use crate::{GetOptions, GetRange, GetResult, GetResultPayload, Result}; +use crate::{Attribute, Attributes, GetOptions, GetRange, GetResult, GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; -use hyper::header::CONTENT_RANGE; +use hyper::header::{CACHE_CONTROL, CONTENT_RANGE, CONTENT_TYPE}; use hyper::StatusCode; use reqwest::header::ToStrError; use reqwest::Response; @@ -117,6 +117,12 @@ enum GetResultError { #[snafu(display("Content-Range header contained non UTF-8 characters"))] InvalidContentRange { source: ToStrError }, + #[snafu(display("Cache-Control header contained non UTF-8 characters"))] + InvalidCacheControl { source: ToStrError }, + + #[snafu(display("Content-Type header contained non UTF-8 characters"))] + InvalidContentType { source: ToStrError }, + #[snafu(display("Requested {expected:?}, got {actual:?}"))] UnexpectedRange { expected: Range, @@ -161,6 +167,16 @@ fn get_result( 0..meta.size }; + let mut attributes = Attributes::new(); + if let Some(x) = response.headers().get(CACHE_CONTROL) { + let x = x.to_str().context(InvalidCacheControlSnafu)?; + attributes.insert(Attribute::CacheControl, x.to_string().into()); + } + if let Some(x) = response.headers().get(CONTENT_TYPE) { + let x = x.to_str().context(InvalidContentTypeSnafu)?; + attributes.insert(Attribute::ContentType, x.to_string().into()); + } + let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { @@ -172,6 +188,7 @@ fn get_result( Ok(GetResult { range, meta, + attributes, payload: GetResultPayload::Stream(stream), }) } diff --git a/src/client/mod.rs b/src/client/mod.rs index 7728f38..3fefbb5 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -485,7 +485,7 @@ impl ClientOptions { /// mime type if it was defined initially through /// `ClientOptions::with_content_type_for_suffix` /// - /// Otherwise returns the default mime type if it was defined + /// Otherwise, returns the default mime type if it was defined /// earlier through `ClientOptions::with_default_content_type` pub fn get_content_type(&self, path: &Path) -> Option<&str> { match path.extension() { diff --git a/src/gcp/client.rs b/src/gcp/client.rs index f91217f..4ee03ea 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -29,14 +29,14 @@ use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; use crate::util::hex_encode; use crate::{ - ClientOptions, GetOptions, ListResult, MultipartId, PutMode, PutOptions, PutPayload, PutResult, - Result, RetryConfig, + Attribute, Attributes, ClientOptions, GetOptions, 
ListResult, MultipartId, PutMode, PutOptions, + PutPayload, PutResult, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Buf; -use hyper::header::CONTENT_LENGTH; +use hyper::header::{CACHE_CONTROL, CONTENT_LENGTH, CONTENT_TYPE}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::header::HeaderName; use reqwest::{header, Client, Method, RequestBuilder, Response, StatusCode}; @@ -45,6 +45,7 @@ use snafu::{OptionExt, ResultExt, Snafu}; use std::sync::Arc; const VERSION_HEADER: &str = "x-goog-generation"; +const DEFAULT_CONTENT_TYPE: &str = "application/octet-stream"; static VERSION_MATCH: HeaderName = HeaderName::from_static("x-goog-if-generation-match"); @@ -323,19 +324,31 @@ impl GoogleCloudStorageClient { /// Perform a put request /// /// Returns the new ETag - pub fn put_request<'a>(&'a self, path: &'a Path, payload: PutPayload) -> PutRequest<'a> { + pub fn put_request<'a>( + &'a self, + path: &'a Path, + payload: PutPayload, + attributes: Attributes, + ) -> PutRequest<'a> { let url = self.object_url(path); + let mut builder = self.client.request(Method::PUT, url); + + let mut has_content_type = false; + for (k, v) in &attributes { + builder = match k { + Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentType => { + has_content_type = true; + builder.header(CONTENT_TYPE, v.as_ref()) + } + }; + } - let content_type = self - .config - .client_options - .get_content_type(path) - .unwrap_or("application/octet-stream"); - - let builder = self - .client - .request(Method::PUT, url) - .header(header::CONTENT_TYPE, content_type); + if !has_content_type { + let opts = &self.config.client_options; + let value = opts.get_content_type(path).unwrap_or(DEFAULT_CONTENT_TYPE); + builder = builder.header(CONTENT_TYPE, value) + } PutRequest { path, @@ -352,7 +365,7 @@ impl GoogleCloudStorageClient { payload: PutPayload, opts: PutOptions, ) -> Result { - let builder = self.put_request(path, payload); + let builder = self.put_request(path, payload, opts.attributes); let builder = match &opts.mode { PutMode::Overwrite => builder.set_idempotent(true), @@ -386,7 +399,7 @@ impl GoogleCloudStorageClient { ("uploadId", upload_id), ]; let result = self - .put_request(path, data) + .put_request(path, data, Attributes::new()) .query(query) .set_idempotent(true) .send() @@ -459,7 +472,7 @@ impl GoogleCloudStorageClient { if completed_parts.is_empty() { // GCS doesn't allow empty multipart uploads let result = self - .put_request(path, Default::default()) + .put_request(path, Default::default(), Attributes::new()) .set_idempotent(true) .send() .await?; diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 149da76..af6e671 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -292,6 +292,8 @@ mod test { // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; put_opts(&integration, true).await; + // Fake GCS server doesn't currently support attributes + put_get_attributes(&integration).await; } } diff --git a/src/http/client.rs b/src/http/client.rs index 39f68ec..cf25919 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -21,11 +21,11 @@ use crate::client::retry::{self, RetryConfig, RetryExt}; use crate::client::GetOptionsExt; use crate::path::{Path, DELIMITER}; use crate::util::deserialize_rfc1123; -use crate::{ClientOptions, GetOptions, ObjectMeta, PutPayload, Result}; +use crate::{Attribute, Attributes, ClientOptions, GetOptions, 
ObjectMeta, PutPayload, Result}; use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; -use hyper::header::CONTENT_LENGTH; +use hyper::header::{CACHE_CONTROL, CONTENT_LENGTH}; use percent_encoding::percent_decode_str; use reqwest::header::CONTENT_TYPE; use reqwest::{Method, Response, StatusCode}; @@ -157,13 +157,32 @@ impl Client { Ok(()) } - pub async fn put(&self, location: &Path, payload: PutPayload) -> Result { + pub async fn put( + &self, + location: &Path, + payload: PutPayload, + attributes: Attributes, + ) -> Result { let mut retry = false; loop { let url = self.path_url(location); let mut builder = self.client.put(url); - if let Some(value) = self.client_options.get_content_type(location) { - builder = builder.header(CONTENT_TYPE, value); + + let mut has_content_type = false; + for (k, v) in &attributes { + builder = match k { + Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentType => { + has_content_type = true; + builder.header(CONTENT_TYPE, v.as_ref()) + } + }; + } + + if !has_content_type { + if let Some(value) = self.client_options.get_content_type(location) { + builder = builder.header(CONTENT_TYPE, value); + } } let resp = builder diff --git a/src/http/mod.rs b/src/http/mod.rs index a838a0f..d6ba4f4 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -105,7 +105,7 @@ impl ObjectStore for HttpStore { return Err(crate::Error::NotImplemented); } - let response = self.client.put(location, payload).await?; + let response = self.client.put(location, payload, opts.attributes).await?; let e_tag = match get_etag(response.headers()) { Ok(e_tag) => Some(e_tag), Err(crate::client::header::Error::MissingEtag) => None, @@ -260,7 +260,7 @@ mod tests { .build() .unwrap(); - put_get_delete_list_opts(&integration).await; + put_get_delete_list(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/src/lib.rs b/src/lib.rs index 157852f..b492d93 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -543,6 +543,10 @@ mod payload; mod upload; mod util; +mod attributes; + +pub use attributes::*; + pub use parse::{parse_url, parse_url_opts}; pub use payload::*; pub use upload::*; @@ -989,6 +993,8 @@ pub struct GetResult { pub meta: ObjectMeta, /// The range of bytes returned by this request pub range: Range, + /// Additional object attributes + pub attributes: Attributes, } /// The kind of a [`GetResult`] @@ -1114,6 +1120,10 @@ pub struct PutOptions { /// /// Implementations that don't support object tagging should ignore this pub tags: TagSet, + /// Provide a set of [`Attributes`] + /// + /// Implementations that don't support an attribute should return an error + pub attributes: Attributes, } impl From for PutOptions { @@ -1251,10 +1261,6 @@ mod tests { use rand::{thread_rng, Rng}; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { - put_get_delete_list_opts(storage).await - } - - pub(crate) async fn put_get_delete_list_opts(storage: &DynObjectStore) { delete_fixtures(storage).await; let content_list = flatten_list_stream(storage, None).await.unwrap(); @@ -1674,6 +1680,28 @@ mod tests { storage.delete(&path).await.unwrap(); } + pub(crate) async fn put_get_attributes(integration: &dyn ObjectStore) { + // Test handling of attributes + let attributes = Attributes::from_iter([ + (Attribute::ContentType, "text/html; charset=utf-8"), + (Attribute::CacheControl, "max-age=604800"), + ]); + + let path = 
Path::from("attributes"); + let opts = PutOptions { + attributes: attributes.clone(), + ..Default::default() + }; + match integration.put_opts(&path, "foo".into(), opts).await { + Ok(_) => { + let r = integration.get(&path).await.unwrap(); + assert_eq!(r.attributes, attributes); + } + Err(Error::NotImplemented) => {} + Err(e) => panic!("{e}"), + } + } + pub(crate) async fn get_opts(storage: &dyn ObjectStore) { let path = Path::from("test"); storage.put(&path, "foo".into()).await.unwrap(); diff --git a/src/local.rs b/src/local.rs index d5581cd..a3695ad 100644 --- a/src/local.rs +++ b/src/local.rs @@ -38,8 +38,8 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, util::InvalidGetRange, - GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, - PutMode, PutOptions, PutPayload, PutResult, Result, UploadPart, + Attributes, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMode, PutOptions, PutPayload, PutResult, Result, UploadPart, }; /// A specialized `Error` for filesystem object store-related errors @@ -346,6 +346,10 @@ impl ObjectStore for LocalFileSystem { return Err(crate::Error::NotImplemented); } + if !opts.attributes.is_empty() { + return Err(crate::Error::NotImplemented); + } + let path = self.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let (mut file, staging_path) = new_staged_upload(&path)?; @@ -421,6 +425,7 @@ impl ObjectStore for LocalFileSystem { Ok(GetResult { payload: GetResultPayload::File(file, path), + attributes: Attributes::default(), range, meta, }) diff --git a/src/memory.rs b/src/memory.rs index d42e6f2..e34b28f 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -30,8 +30,9 @@ use snafu::{OptionExt, ResultExt, Snafu}; use crate::multipart::{MultipartStore, PartId}; use crate::util::InvalidGetRange; use crate::{ - path::Path, GetRange, GetResult, GetResultPayload, ListResult, MultipartId, MultipartUpload, - ObjectMeta, ObjectStore, PutMode, PutOptions, PutResult, Result, UpdateVersion, UploadPart, + path::Path, Attributes, GetRange, GetResult, GetResultPayload, ListResult, MultipartId, + MultipartUpload, ObjectMeta, ObjectStore, PutMode, PutOptions, PutResult, Result, + UpdateVersion, UploadPart, }; use crate::{GetOptions, PutPayload}; @@ -88,15 +89,22 @@ pub struct InMemory { struct Entry { data: Bytes, last_modified: DateTime, + attributes: Attributes, e_tag: usize, } impl Entry { - fn new(data: Bytes, last_modified: DateTime, e_tag: usize) -> Self { + fn new( + data: Bytes, + last_modified: DateTime, + e_tag: usize, + attributes: Attributes, + ) -> Self { Self { data, last_modified, e_tag, + attributes, } } } @@ -116,10 +124,10 @@ struct PartStorage { type SharedStorage = Arc>; impl Storage { - fn insert(&mut self, location: &Path, bytes: Bytes) -> usize { + fn insert(&mut self, location: &Path, bytes: Bytes, attributes: Attributes) -> usize { let etag = self.next_etag; self.next_etag += 1; - let entry = Entry::new(bytes, Utc::now(), etag); + let entry = Entry::new(bytes, Utc::now(), etag, attributes); self.overwrite(location, entry); etag } @@ -200,7 +208,7 @@ impl ObjectStore for InMemory { ) -> Result { let mut storage = self.storage.write(); let etag = storage.next_etag; - let entry = Entry::new(payload.into(), Utc::now(), etag); + let entry = Entry::new(payload.into(), Utc::now(), etag, opts.attributes); match opts.mode { PutMode::Overwrite => storage.overwrite(location, entry), @@ -247,6 +255,7 @@ impl ObjectStore for InMemory { 
Ok(GetResult { payload: GetResultPayload::Stream(stream.boxed()), + attributes: entry.attributes, meta, range, }) @@ -363,7 +372,9 @@ impl ObjectStore for InMemory { async fn copy(&self, from: &Path, to: &Path) -> Result<()> { let entry = self.entry(from).await?; - self.storage.write().insert(to, entry.data); + self.storage + .write() + .insert(to, entry.data, entry.attributes); Ok(()) } @@ -376,7 +387,7 @@ impl ObjectStore for InMemory { } .into()); } - storage.insert(to, entry.data); + storage.insert(to, entry.data, entry.attributes); Ok(()) } } @@ -426,7 +437,7 @@ impl MultipartStore for InMemory { for x in &upload.parts { buf.extend_from_slice(x.as_ref().unwrap()) } - let etag = storage.insert(path, buf.into()); + let etag = storage.insert(path, buf.into(), Default::default()); Ok(PutResult { e_tag: Some(etag.to_string()), version: None, @@ -492,7 +503,11 @@ impl MultipartUpload for InMemoryUpload { let mut buf = Vec::with_capacity(cap); let parts = self.parts.iter().flatten(); parts.for_each(|x| buf.extend_from_slice(x)); - let etag = self.storage.write().insert(&self.location, buf.into()); + let etag = self + .storage + .write() + .insert(&self.location, buf.into(), Attributes::new()); + Ok(PutResult { e_tag: Some(etag.to_string()), version: None, @@ -523,6 +538,7 @@ mod tests { stream_get(&integration).await; put_opts(&integration, true).await; multipart(&integration, &integration).await; + put_get_attributes(&integration).await; } #[tokio::test] From d698b8cbfa3c16b0de2ef3942776b5b7b95d1a80 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 Apr 2024 11:51:56 +0100 Subject: [PATCH 295/397] Add put_multipart_opts (#5435) (#5652) * Add put_multipart_opts (#5435) --- src/aws/client.rs | 203 +++++++++++++++++++--------------------- src/aws/mod.rs | 48 +++++++--- src/azure/client.rs | 93 ++++++++++-------- src/azure/mod.rs | 16 +++- src/chunked.rs | 10 +- src/gcp/client.rs | 174 ++++++++++++++++------------------ src/gcp/mod.rs | 15 ++- src/http/mod.rs | 8 +- src/lib.rs | 104 +++++++++++++++++--- src/limit.rs | 16 +++- src/local.rs | 12 ++- src/memory.rs | 21 +++-- src/prefix.rs | 13 ++- src/throttle.rs | 14 ++- tests/get_range_file.rs | 6 +- 15 files changed, 461 insertions(+), 292 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index e81ef6a..4a4dc17 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -19,8 +19,8 @@ use crate::aws::builder::S3EncryptionHeaders; use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; use crate::aws::{ - AwsAuthorizer, AwsCredentialProvider, S3ConditionalPut, S3CopyIfNotExists, STORE, - STRICT_PATH_ENCODE_SET, + AwsAuthorizer, AwsCredentialProvider, S3ConditionalPut, S3CopyIfNotExists, COPY_SOURCE_HEADER, + STORE, STRICT_PATH_ENCODE_SET, TAGS_HEADER, }; use crate::client::get::GetClient; use crate::client::header::{get_etag, HeaderConfig}; @@ -35,16 +35,16 @@ use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; use crate::{ - Attribute, Attributes, ClientOptions, GetOptions, ListResult, MultipartId, Path, PutPayload, - PutResult, Result, RetryConfig, + Attribute, Attributes, ClientOptions, GetOptions, ListResult, MultipartId, Path, + PutMultipartOpts, PutPayload, PutResult, Result, RetryConfig, TagSet, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; use hyper::header::{CACHE_CONTROL, CONTENT_LENGTH}; -use hyper::http; use 
hyper::http::HeaderName; +use hyper::{http, HeaderMap}; use itertools::Itertools; use md5::{Digest, Md5}; use percent_encoding::{utf8_percent_encode, PercentEncode}; @@ -98,9 +98,6 @@ pub(crate) enum Error { #[snafu(display("Error getting list response body: {}", source))] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Error performing create multipart request: {}", source))] - CreateMultipartRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting create multipart response body: {}", source))] CreateMultipartResponseBody { source: reqwest::Error }, @@ -289,8 +286,75 @@ impl<'a> Request<'a> { Self { builder, ..self } } - pub fn idempotent(mut self, idempotent: bool) -> Self { - self.idempotent = idempotent; + pub fn headers(self, headers: HeaderMap) -> Self { + let builder = self.builder.headers(headers); + Self { builder, ..self } + } + + pub fn idempotent(self, idempotent: bool) -> Self { + Self { idempotent, ..self } + } + + pub fn with_encryption_headers(self) -> Self { + let headers = self.config.encryption_headers.clone().into(); + let builder = self.builder.headers(headers); + Self { builder, ..self } + } + + pub fn with_session_creds(self, use_session_creds: bool) -> Self { + Self { + use_session_creds, + ..self + } + } + + pub fn with_tags(mut self, tags: TagSet) -> Self { + let tags = tags.encoded(); + if !tags.is_empty() && !self.config.disable_tagging { + self.builder = self.builder.header(&TAGS_HEADER, tags); + } + self + } + + pub fn with_attributes(self, attributes: Attributes) -> Self { + let mut has_content_type = false; + let mut builder = self.builder; + for (k, v) in &attributes { + builder = match k { + Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentType => { + has_content_type = true; + builder.header(CONTENT_TYPE, v.as_ref()) + } + }; + } + + if !has_content_type { + if let Some(value) = self.config.client_options.get_content_type(self.path) { + builder = builder.header(CONTENT_TYPE, value); + } + } + Self { builder, ..self } + } + + pub fn with_payload(mut self, payload: PutPayload) -> Self { + if !self.config.skip_signature || self.config.checksum.is_some() { + let mut sha256 = Context::new(&digest::SHA256); + payload.iter().for_each(|x| sha256.update(x)); + let payload_sha256 = sha256.finish(); + + if let Some(Checksum::SHA256) = self.config.checksum { + self.builder = self.builder.header( + "x-amz-checksum-sha256", + BASE64_STANDARD.encode(payload_sha256), + ); + } + self.payload_sha256 = Some(payload_sha256); + } + + let content_length = payload.content_length(); + self.builder = self.builder.header(CONTENT_LENGTH, content_length); + self.payload = Some(payload); self } @@ -335,81 +399,19 @@ impl S3Client { Ok(Self { config, client }) } - /// Make an S3 PUT request - /// - /// Returns the ETag - pub fn put_request<'a>( - &'a self, - path: &'a Path, - payload: PutPayload, - attributes: Attributes, - with_encryption_headers: bool, - ) -> Request<'a> { + pub fn request<'a>(&'a self, method: Method, path: &'a Path) -> Request<'a> { let url = self.config.path_url(path); - let mut builder = self.client.request(Method::PUT, url); - if with_encryption_headers { - builder = builder.headers(self.config.encryption_headers.clone().into()); - } - - let mut sha256 = Context::new(&digest::SHA256); - payload.iter().for_each(|x| sha256.update(x)); - let payload_sha256 = sha256.finish(); - - if let Some(Checksum::SHA256) = self.config.checksum { - builder = builder.header( - 
"x-amz-checksum-sha256", - BASE64_STANDARD.encode(payload_sha256), - ) - } - - let mut has_content_type = false; - for (k, v) in &attributes { - builder = match k { - Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), - Attribute::ContentType => { - has_content_type = true; - builder.header(CONTENT_TYPE, v.as_ref()) - } - }; - } - - if !has_content_type { - if let Some(value) = self.config.client_options.get_content_type(path) { - builder = builder.header(CONTENT_TYPE, value); - } - } - Request { path, - builder: builder.header(CONTENT_LENGTH, payload.content_length()), - payload: Some(payload), - payload_sha256: Some(payload_sha256), + builder: self.client.request(method, url), + payload: None, + payload_sha256: None, config: &self.config, use_session_creds: true, idempotent: false, } } - /// Make an S3 Delete request - pub async fn delete_request( - &self, - path: &Path, - query: &T, - ) -> Result<()> { - let credential = self.config.get_session_credential().await?; - let url = self.config.path_url(path); - - self.client - .request(Method::DELETE, url) - .query(query) - .with_aws_sigv4(credential.authorizer(), None) - .send_retry(&self.config.retry_config) - .await - .map_err(|e| e.error(STORE, path.to_string()))?; - - Ok(()) - } - /// Make an S3 Delete Objects request /// /// Produces a vector of results, one for each path in the input vector. If @@ -513,41 +515,29 @@ impl S3Client { } /// Make an S3 Copy request - pub fn copy_request<'a>(&'a self, from: &'a Path, to: &Path) -> Request<'a> { - let url = self.config.path_url(to); + pub fn copy_request<'a>(&'a self, from: &Path, to: &'a Path) -> Request<'a> { let source = format!("{}/{}", self.config.bucket, encode_path(from)); - - let builder = self - .client - .request(Method::PUT, url) - .header("x-amz-copy-source", source) - .headers(self.config.encryption_headers.clone().into()); - - Request { - builder, - path: from, - config: &self.config, - payload: None, - payload_sha256: None, - use_session_creds: false, - idempotent: false, - } + self.request(Method::PUT, to) + .idempotent(true) + .header(©_SOURCE_HEADER, &source) + .headers(self.config.encryption_headers.clone().into()) + .with_session_creds(false) } - pub async fn create_multipart(&self, location: &Path) -> Result { - let credential = self.config.get_session_credential().await?; - let url = format!("{}?uploads=", self.config.path_url(location),); - + pub async fn create_multipart( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result { let response = self - .client - .request(Method::POST, url) - .headers(self.config.encryption_headers.clone().into()) - .with_aws_sigv4(credential.authorizer(), None) - .retryable(&self.config.retry_config) + .request(Method::POST, location) + .query(&[("uploads", "")]) + .with_encryption_headers() + .with_attributes(opts.attributes) + .with_tags(opts.tags) .idempotent(true) .send() - .await - .context(CreateMultipartRequestSnafu)? + .await? 
.bytes() .await .context(CreateMultipartResponseBodySnafu)?; @@ -568,7 +558,8 @@ impl S3Client { let part = (part_idx + 1).to_string(); let response = self - .put_request(path, data, Attributes::default(), false) + .request(Method::PUT, path) + .with_payload(data) .query(&[("partNumber", &part), ("uploadId", upload_id)]) .idempotent(true) .send() diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 43bd38a..7f1edf1 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -45,10 +45,12 @@ use crate::signer::Signer; use crate::util::STRICT_ENCODE_SET; use crate::{ Error, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, ObjectMeta, - ObjectStore, Path, PutMode, PutOptions, PutPayload, PutResult, Result, UploadPart, + ObjectStore, Path, PutMode, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, + UploadPart, }; static TAGS_HEADER: HeaderName = HeaderName::from_static("x-amz-tagging"); +static COPY_SOURCE_HEADER: HeaderName = HeaderName::from_static("x-amz-copy-source"); mod builder; mod checksum; @@ -156,12 +158,13 @@ impl ObjectStore for AmazonS3 { payload: PutPayload, opts: PutOptions, ) -> Result { - let attrs = opts.attributes; - let mut request = self.client.put_request(location, payload, attrs, true); - let tags = opts.tags.encoded(); - if !tags.is_empty() && !self.client.config.disable_tagging { - request = request.header(&TAGS_HEADER, tags); - } + let request = self + .client + .request(Method::PUT, location) + .with_payload(payload) + .with_attributes(opts.attributes) + .with_tags(opts.tags) + .with_encryption_headers(); match (opts.mode, &self.client.config.conditional_put) { (PutMode::Overwrite, _) => request.idempotent(true).do_put().await, @@ -204,8 +207,12 @@ impl ObjectStore for AmazonS3 { } } - async fn put_multipart(&self, location: &Path) -> Result> { - let upload_id = self.client.create_multipart(location).await?; + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + let upload_id = self.client.create_multipart(location, opts).await?; Ok(Box::new(S3MultiPartUpload { part_idx: 0, @@ -223,7 +230,8 @@ impl ObjectStore for AmazonS3 { } async fn delete(&self, location: &Path) -> Result<()> { - self.client.delete_request(location, &()).await + self.client.request(Method::DELETE, location).send().await?; + Ok(()) } fn delete_stream<'a>( @@ -351,15 +359,22 @@ impl MultipartUpload for S3MultiPartUpload { async fn abort(&mut self) -> Result<()> { self.state .client - .delete_request(&self.state.location, &[("uploadId", &self.state.upload_id)]) - .await + .request(Method::DELETE, &self.state.location) + .query(&[("uploadId", &self.state.upload_id)]) + .idempotent(true) + .send() + .await?; + + Ok(()) } } #[async_trait] impl MultipartStore for AmazonS3 { async fn create_multipart(&self, path: &Path) -> Result { - self.client.create_multipart(path).await + self.client + .create_multipart(path, PutMultipartOpts::default()) + .await } async fn put_part( @@ -382,7 +397,12 @@ impl MultipartStore for AmazonS3 { } async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()> { - self.client.delete_request(path, &[("uploadId", id)]).await + self.client + .request(Method::DELETE, path) + .query(&[("uploadId", id)]) + .send() + .await?; + Ok(()) } } diff --git a/src/azure/client.rs b/src/azure/client.rs index 134609e..918fcd0 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -28,16 +28,14 @@ use crate::path::DELIMITER; use crate::util::{deserialize_rfc1123, GetRange}; use crate::{ Attribute, 
Attributes, ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutMode, - PutOptions, PutPayload, PutResult, Result, RetryConfig, + PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, RetryConfig, TagSet, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; -use hyper::header::CACHE_CONTROL; use hyper::http::HeaderName; -use reqwest::header::CONTENT_TYPE; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_MATCH, IF_NONE_MATCH}, Client as ReqwestClient, Method, RequestBuilder, Response, @@ -50,6 +48,8 @@ use std::time::Duration; use url::Url; const VERSION_HEADER: &str = "x-ms-version-id"; +static MS_CACHE_CONTROL: HeaderName = HeaderName::from_static("x-ms-blob-cache-control"); +static MS_CONTENT_TYPE: HeaderName = HeaderName::from_static("x-ms-blob-content-type"); static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); @@ -188,10 +188,39 @@ impl<'a> PutRequest<'a> { Self { builder, ..self } } - fn set_idempotent(self, idempotent: bool) -> Self { + fn idempotent(self, idempotent: bool) -> Self { Self { idempotent, ..self } } + fn with_tags(mut self, tags: TagSet) -> Self { + let tags = tags.encoded(); + if !tags.is_empty() && !self.config.disable_tagging { + self.builder = self.builder.header(&TAGS_HEADER, tags); + } + self + } + + fn with_attributes(self, attributes: Attributes) -> Self { + let mut builder = self.builder; + let mut has_content_type = false; + for (k, v) in &attributes { + builder = match k { + Attribute::CacheControl => builder.header(&MS_CACHE_CONTROL, v.as_ref()), + Attribute::ContentType => { + has_content_type = true; + builder.header(&MS_CONTENT_TYPE, v.as_ref()) + } + }; + } + + if !has_content_type { + if let Some(value) = self.config.client_options.get_content_type(self.path) { + builder = builder.header(&MS_CONTENT_TYPE, value); + } + } + Self { builder, ..self } + } + async fn send(self) -> Result { let credential = self.config.get_credential().await?; let response = self @@ -233,32 +262,9 @@ impl AzureClient { self.config.get_credential().await } - fn put_request<'a>( - &'a self, - path: &'a Path, - payload: PutPayload, - attributes: Attributes, - ) -> PutRequest<'a> { + fn put_request<'a>(&'a self, path: &'a Path, payload: PutPayload) -> PutRequest<'a> { let url = self.config.path_url(path); - - let mut builder = self.client.request(Method::PUT, url); - - let mut has_content_type = false; - for (k, v) in &attributes { - builder = match k { - Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), - Attribute::ContentType => { - has_content_type = true; - builder.header(CONTENT_TYPE, v.as_ref()) - } - }; - } - - if !has_content_type { - if let Some(value) = self.config.client_options.get_content_type(path) { - builder = builder.header(CONTENT_TYPE, value); - } - } + let builder = self.client.request(Method::PUT, url); PutRequest { path, @@ -276,10 +282,13 @@ impl AzureClient { payload: PutPayload, opts: PutOptions, ) -> Result { - let builder = self.put_request(path, payload, opts.attributes); + let builder = self + .put_request(path, payload) + .with_attributes(opts.attributes) + .with_tags(opts.tags); let builder = match &opts.mode { - PutMode::Overwrite => builder.set_idempotent(true), + PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&IF_NONE_MATCH, "*"), PutMode::Update(v) => { let etag = v.e_tag.as_ref().context(MissingETagSnafu)?; @@ -287,11 +296,6 @@ impl AzureClient { } }; - let 
builder = match (opts.tags.encoded(), self.config.disable_tagging) { - ("", _) | (_, true) => builder, - (tags, false) => builder.header(&TAGS_HEADER, tags), - }; - let response = builder.header(&BLOB_TYPE, "BlockBlob").send().await?; Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) } @@ -306,9 +310,9 @@ impl AzureClient { let content_id = format!("{part_idx:20}"); let block_id = BASE64_STANDARD.encode(&content_id); - self.put_request(path, payload, Attributes::default()) + self.put_request(path, payload) .query(&[("comp", "block"), ("blockid", &block_id)]) - .set_idempotent(true) + .idempotent(true) .send() .await?; @@ -316,7 +320,12 @@ impl AzureClient { } /// PUT a block list - pub async fn put_block_list(&self, path: &Path, parts: Vec) -> Result { + pub async fn put_block_list( + &self, + path: &Path, + parts: Vec, + opts: PutMultipartOpts, + ) -> Result { let blocks = parts .into_iter() .map(|part| BlockId::from(part.content_id)) @@ -324,9 +333,11 @@ impl AzureClient { let payload = BlockList { blocks }.to_xml().into(); let response = self - .put_request(path, payload, Attributes::default()) + .put_request(path, payload) + .with_attributes(opts.attributes) + .with_tags(opts.tags) .query(&[("comp", "blocklist")]) - .set_idempotent(true) + .idempotent(true) .send() .await?; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 3bb57c4..25ae6dd 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -27,7 +27,7 @@ use crate::{ path::Path, signer::Signer, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, ObjectMeta, ObjectStore, - PutOptions, PutPayload, PutResult, Result, UploadPart, + PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, UploadPart, }; use async_trait::async_trait; use futures::stream::BoxStream; @@ -95,9 +95,14 @@ impl ObjectStore for MicrosoftAzure { self.client.put_blob(location, payload, opts).await } - async fn put_multipart(&self, location: &Path) -> Result> { + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { Ok(Box::new(AzureMultiPartUpload { part_idx: 0, + opts, state: Arc::new(UploadState { client: Arc::clone(&self.client), location: location.clone(), @@ -196,6 +201,7 @@ impl Signer for MicrosoftAzure { struct AzureMultiPartUpload { part_idx: usize, state: Arc, + opts: PutMultipartOpts, } #[derive(Debug)] @@ -223,7 +229,7 @@ impl MultipartUpload for AzureMultiPartUpload { self.state .client - .put_block_list(&self.state.location, parts) + .put_block_list(&self.state.location, parts, std::mem::take(&mut self.opts)) .await } @@ -255,7 +261,9 @@ impl MultipartStore for MicrosoftAzure { _: &MultipartId, parts: Vec, ) -> Result { - self.client.put_block_list(path, parts).await + self.client + .put_block_list(path, parts, Default::default()) + .await } async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> Result<()> { diff --git a/src/chunked.rs b/src/chunked.rs index 9abe49d..a3bd762 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -29,7 +29,7 @@ use futures::StreamExt; use crate::path::Path; use crate::{ GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, - PutOptions, PutResult, + PutMultipartOpts, PutOptions, PutResult, }; use crate::{PutPayload, Result}; @@ -75,6 +75,14 @@ impl ObjectStore for ChunkedStore { self.inner.put_multipart(location).await } + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + self.inner.put_multipart_opts(location, opts).await + 
} + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let r = self.inner.get_opts(location, options).await?; let stream = match r.payload { diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 4ee03ea..9c39efe 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -29,8 +29,8 @@ use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; use crate::util::hex_encode; use crate::{ - Attribute, Attributes, ClientOptions, GetOptions, ListResult, MultipartId, PutMode, PutOptions, - PutPayload, PutResult, Result, RetryConfig, + Attribute, Attributes, ClientOptions, GetOptions, ListResult, MultipartId, PutMode, + PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -39,7 +39,7 @@ use bytes::Buf; use hyper::header::{CACHE_CONTROL, CONTENT_LENGTH, CONTENT_TYPE}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::header::HeaderName; -use reqwest::{header, Client, Method, RequestBuilder, Response, StatusCode}; +use reqwest::{Client, Method, RequestBuilder, Response, StatusCode}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::sync::Arc; @@ -66,14 +66,8 @@ enum Error { path: String, }, - #[snafu(display("Error performing delete request {}: {}", path, source))] - DeleteRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error performing put request {}: {}", path, source))] - PutRequest { + #[snafu(display("Error performing request {}: {}", path, source))] + Request { source: crate::client::retry::Error, path: String, }, @@ -120,9 +114,9 @@ enum Error { impl From for crate::Error { fn from(err: Error) -> Self { match err { - Error::GetRequest { source, path } - | Error::DeleteRequest { source, path } - | Error::PutRequest { source, path } => source.error(STORE, path), + Error::GetRequest { source, path } | Error::Request { source, path } => { + source.error(STORE, path) + } _ => Self::Generic { store: STORE, source: Box::new(err), @@ -171,15 +165,15 @@ impl GoogleCloudStorageConfig { } /// A builder for a put request allowing customisation of the headers and query string -pub struct PutRequest<'a> { +pub struct Request<'a> { path: &'a Path, config: &'a GoogleCloudStorageConfig, - payload: PutPayload, + payload: Option, builder: RequestBuilder, idempotent: bool, } -impl<'a> PutRequest<'a> { +impl<'a> Request<'a> { fn header(self, k: &HeaderName, v: &str) -> Self { let builder = self.builder.header(k, v); Self { builder, ..self } @@ -190,26 +184,58 @@ impl<'a> PutRequest<'a> { Self { builder, ..self } } - fn set_idempotent(mut self, idempotent: bool) -> Self { + fn idempotent(mut self, idempotent: bool) -> Self { self.idempotent = idempotent; self } - async fn send(self) -> Result { + fn with_attributes(self, attributes: Attributes) -> Self { + let mut builder = self.builder; + let mut has_content_type = false; + for (k, v) in &attributes { + builder = match k { + Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentType => { + has_content_type = true; + builder.header(CONTENT_TYPE, v.as_ref()) + } + }; + } + + if !has_content_type { + let value = self.config.client_options.get_content_type(self.path); + builder = builder.header(CONTENT_TYPE, value.unwrap_or(DEFAULT_CONTENT_TYPE)) + } + Self { builder, ..self } + } + + fn with_payload(self, payload: PutPayload) -> Self { + let content_length = payload.content_length(); + 
Self { + builder: self.builder.header(CONTENT_LENGTH, content_length), + payload: Some(payload), + ..self + } + } + + async fn send(self) -> Result { let credential = self.config.credentials.get_credential().await?; - let response = self + let resp = self .builder .bearer_auth(&credential.bearer) - .header(CONTENT_LENGTH, self.payload.content_length()) .retryable(&self.config.retry_config) .idempotent(self.idempotent) - .payload(Some(self.payload)) + .payload(self.payload) .send() .await - .context(PutRequestSnafu { + .context(RequestSnafu { path: self.path.as_ref(), })?; + Ok(resp) + } + async fn do_put(self) -> Result { + let response = self.send().await?; Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) } } @@ -324,36 +350,13 @@ impl GoogleCloudStorageClient { /// Perform a put request /// /// Returns the new ETag - pub fn put_request<'a>( - &'a self, - path: &'a Path, - payload: PutPayload, - attributes: Attributes, - ) -> PutRequest<'a> { - let url = self.object_url(path); - let mut builder = self.client.request(Method::PUT, url); - - let mut has_content_type = false; - for (k, v) in &attributes { - builder = match k { - Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), - Attribute::ContentType => { - has_content_type = true; - builder.header(CONTENT_TYPE, v.as_ref()) - } - }; - } + pub fn request<'a>(&'a self, method: Method, path: &'a Path) -> Request<'a> { + let builder = self.client.request(method, self.object_url(path)); - if !has_content_type { - let opts = &self.config.client_options; - let value = opts.get_content_type(path).unwrap_or(DEFAULT_CONTENT_TYPE); - builder = builder.header(CONTENT_TYPE, value) - } - - PutRequest { + Request { path, builder, - payload, + payload: None, config: &self.config, idempotent: false, } @@ -365,10 +368,13 @@ impl GoogleCloudStorageClient { payload: PutPayload, opts: PutOptions, ) -> Result { - let builder = self.put_request(path, payload, opts.attributes); + let builder = self + .request(Method::PUT, path) + .with_payload(payload) + .with_attributes(opts.attributes); let builder = match &opts.mode { - PutMode::Overwrite => builder.set_idempotent(true), + PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&VERSION_MATCH, "0"), PutMode::Update(v) => { let etag = v.version.as_ref().context(MissingVersionSnafu)?; @@ -376,7 +382,7 @@ impl GoogleCloudStorageClient { } }; - match (opts.mode, builder.send().await) { + match (opts.mode, builder.do_put().await) { (PutMode::Create, Err(crate::Error::Precondition { path, source })) => { Err(crate::Error::AlreadyExists { path, source }) } @@ -399,10 +405,11 @@ impl GoogleCloudStorageClient { ("uploadId", upload_id), ]; let result = self - .put_request(path, data, Attributes::new()) + .request(Method::PUT, path) + .with_payload(data) .query(query) - .set_idempotent(true) - .send() + .idempotent(true) + .do_put() .await?; Ok(PartId { @@ -411,30 +418,18 @@ impl GoogleCloudStorageClient { } /// Initiate a multipart upload - pub async fn multipart_initiate(&self, path: &Path) -> Result { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let content_type = self - .config - .client_options - .get_content_type(path) - .unwrap_or("application/octet-stream"); - + pub async fn multipart_initiate( + &self, + path: &Path, + opts: PutMultipartOpts, + ) -> Result { let response = self - .client - .request(Method::POST, &url) - .bearer_auth(&credential.bearer) - .header(header::CONTENT_TYPE, 
content_type) - .header(header::CONTENT_LENGTH, "0") + .request(Method::POST, path) + .with_attributes(opts.attributes) + .header(&CONTENT_LENGTH, "0") .query(&[("uploads", "")]) - .retryable(&self.config.retry_config) - .idempotent(true) .send() - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; + .await?; let data = response.bytes().await.context(PutResponseBodySnafu)?; let result: InitiateMultipartUploadResult = @@ -451,12 +446,12 @@ impl GoogleCloudStorageClient { self.client .request(Method::DELETE, &url) .bearer_auth(&credential.bearer) - .header(header::CONTENT_TYPE, "application/octet-stream") - .header(header::CONTENT_LENGTH, "0") + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_LENGTH, "0") .query(&[("uploadId", multipart_id)]) .send_retry(&self.config.retry_config) .await - .context(PutRequestSnafu { + .context(RequestSnafu { path: path.as_ref(), })?; @@ -472,9 +467,9 @@ impl GoogleCloudStorageClient { if completed_parts.is_empty() { // GCS doesn't allow empty multipart uploads let result = self - .put_request(path, Default::default(), Attributes::new()) - .set_idempotent(true) - .send() + .request(Method::PUT, path) + .idempotent(true) + .do_put() .await?; self.multipart_cleanup(path, multipart_id).await?; return Ok(result); @@ -523,18 +518,7 @@ impl GoogleCloudStorageClient { /// Perform a delete request pub async fn delete_request(&self, path: &Path) -> Result<()> { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let builder = self.client.request(Method::DELETE, url); - builder - .bearer_auth(&credential.bearer) - .send_retry(&self.config.retry_config) - .await - .context(DeleteRequestSnafu { - path: path.as_ref(), - })?; - + self.request(Method::DELETE, path).send().await?; Ok(()) } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index af6e671..0ec6e7e 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -42,7 +42,8 @@ use crate::gcp::credential::GCSAuthorizer; use crate::signer::Signer; use crate::{ multipart::PartId, path::Path, GetOptions, GetResult, ListResult, MultipartId, MultipartUpload, - ObjectMeta, ObjectStore, PutOptions, PutPayload, PutResult, Result, UploadPart, + ObjectMeta, ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, + UploadPart, }; use async_trait::async_trait; use client::GoogleCloudStorageClient; @@ -156,8 +157,12 @@ impl ObjectStore for GoogleCloudStorage { self.client.put(location, payload, opts).await } - async fn put_multipart(&self, location: &Path) -> Result> { - let upload_id = self.client.multipart_initiate(location).await?; + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + let upload_id = self.client.multipart_initiate(location, opts).await?; Ok(Box::new(GCSMultipartUpload { part_idx: 0, @@ -206,7 +211,9 @@ impl ObjectStore for GoogleCloudStorage { #[async_trait] impl MultipartStore for GoogleCloudStorage { async fn create_multipart(&self, path: &Path) -> Result { - self.client.multipart_initiate(path).await + self.client + .multipart_initiate(path, PutMultipartOpts::default()) + .await } async fn put_part( diff --git a/src/http/mod.rs b/src/http/mod.rs index d6ba4f4..404211e 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -44,7 +44,7 @@ use crate::http::client::Client; use crate::path::Path; use crate::{ ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, - ObjectStore, PutMode, PutOptions, PutPayload, PutResult, Result, RetryConfig, + 
ObjectStore, PutMode, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, RetryConfig, }; mod client; @@ -118,7 +118,11 @@ impl ObjectStore for HttpStore { }) } - async fn put_multipart(&self, _location: &Path) -> Result> { + async fn put_multipart_opts( + &self, + _location: &Path, + _opts: PutMultipartOpts, + ) -> Result> { Err(crate::Error::NotImplemented) } diff --git a/src/lib.rs b/src/lib.rs index b492d93..ad72bd2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -597,7 +597,20 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// /// Client should prefer [`ObjectStore::put`] for small payloads, as streaming uploads /// typically require multiple separate requests. See [`MultipartUpload`] for more information - async fn put_multipart(&self, location: &Path) -> Result>; + async fn put_multipart(&self, location: &Path) -> Result> { + self.put_multipart_opts(location, PutMultipartOpts::default()) + .await + } + + /// Perform a multipart upload with options + /// + /// Client should prefer [`ObjectStore::put`] for small payloads, as streaming uploads + /// typically require multiple separate requests. See [`MultipartUpload`] for more information + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result>; /// Return the bytes that are stored at the specified location. async fn get(&self, location: &Path) -> Result { @@ -785,6 +798,14 @@ macro_rules! as_ref_impl { self.as_ref().put_multipart(location).await } + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + self.as_ref().put_multipart_opts(location, opts).await + } + async fn get(&self, location: &Path) -> Result { self.as_ref().get(location).await } @@ -1144,6 +1165,46 @@ impl From for PutOptions { } } +impl From for PutOptions { + fn from(attributes: Attributes) -> Self { + Self { + attributes, + ..Default::default() + } + } +} + +/// Options for [`ObjectStore::put_multipart_opts`] +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct PutMultipartOpts { + /// Provide a [`TagSet`] for this object + /// + /// Implementations that don't support object tagging should ignore this + pub tags: TagSet, + /// Provide a set of [`Attributes`] + /// + /// Implementations that don't support an attribute should return an error + pub attributes: Attributes, +} + +impl From for PutMultipartOpts { + fn from(tags: TagSet) -> Self { + Self { + tags, + ..Default::default() + } + } +} + +impl From for PutMultipartOpts { + fn from(attributes: Attributes) -> Self { + Self { + attributes, + ..Default::default() + } + } +} + /// Result for a put request #[derive(Debug, Clone, PartialEq, Eq)] pub struct PutResult { @@ -1688,10 +1749,7 @@ mod tests { ]); let path = Path::from("attributes"); - let opts = PutOptions { - attributes: attributes.clone(), - ..Default::default() - }; + let opts = attributes.clone().into(); match integration.put_opts(&path, "foo".into(), opts).await { Ok(_) => { let r = integration.get(&path).await.unwrap(); @@ -1700,6 +1758,19 @@ mod tests { Err(Error::NotImplemented) => {} Err(e) => panic!("{e}"), } + + let opts = attributes.clone().into(); + match integration.put_multipart_opts(&path, opts).await { + Ok(mut w) => { + w.put_part("foo".into()).await.unwrap(); + w.complete().await.unwrap(); + + let r = integration.get(&path).await.unwrap(); + assert_eq!(r.attributes, attributes); + } + Err(Error::NotImplemented) => {} + Err(e) => panic!("{e}"), + } } pub(crate) async fn get_opts(storage: &dyn ObjectStore) { 
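Editor's note (not part of the patch): a minimal sketch of how downstream code could drive the put_multipart_opts API added in the hunks above. The function name, object path and attribute value are illustrative assumptions; only the object_store items shown come from this change.

// Sketch, assuming the API introduced by this patch: stream a payload while
// attaching attributes (and optionally tags) to the completed object.
use object_store::path::Path;
use object_store::{Attribute, Attributes, ObjectStore, PutMultipartOpts};

async fn upload_with_opts(store: &dyn ObjectStore) -> object_store::Result<()> {
    // PutMultipartOpts can be built from Attributes via the From impl added above;
    // a TagSet could be supplied through its `tags` field in the same way.
    let opts: PutMultipartOpts =
        Attributes::from_iter([(Attribute::ContentType, "text/plain")]).into();

    let mut upload = store
        .put_multipart_opts(&Path::from("example/data.txt"), opts)
        .await?;
    upload.put_part("hello world".into()).await?; // each part is a PutPayload
    upload.complete().await?; // finalizes the upload, returning a PutResult
    Ok(())
}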
@@ -2332,21 +2403,32 @@ mod tests { let path = Path::from("tag_test"); storage - .put_opts(&path, "test".into(), tag_set.into()) + .put_opts(&path, "test".into(), tag_set.clone().into()) .await .unwrap(); + let multi_path = Path::from("tag_test_multi"); + let mut write = storage + .put_multipart_opts(&multi_path, tag_set.into()) + .await + .unwrap(); + + write.put_part("foo".into()).await.unwrap(); + write.complete().await.unwrap(); + // Write should always succeed, but certain configurations may simply ignore tags if !validate { return; } - let resp = get_tags(path.clone()).await.unwrap(); - let body = resp.bytes().await.unwrap(); + for path in [path, multi_path] { + let resp = get_tags(path.clone()).await.unwrap(); + let body = resp.bytes().await.unwrap(); - let mut resp: Tagging = quick_xml::de::from_reader(body.reader()).unwrap(); - resp.list.tags.sort_by(|a, b| a.key.cmp(&b.key)); - assert_eq!(resp.list.tags, tags); + let mut resp: Tagging = quick_xml::de::from_reader(body.reader()).unwrap(); + resp.list.tags.sort_by(|a, b| a.key.cmp(&b.key)); + assert_eq!(resp.list.tags, tags); + } } async fn delete_fixtures(storage: &DynObjectStore) { diff --git a/src/limit.rs b/src/limit.rs index b94aa05..f3e1d42 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -19,7 +19,8 @@ use crate::{ BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, - ObjectStore, Path, PutOptions, PutPayload, PutResult, Result, StreamExt, UploadPart, + ObjectStore, Path, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, StreamExt, + UploadPart, }; use async_trait::async_trait; use bytes::Bytes; @@ -91,6 +92,19 @@ impl ObjectStore for LimitStore { upload, })) } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + let upload = self.inner.put_multipart_opts(location, opts).await?; + Ok(Box::new(LimitUpload { + semaphore: Arc::clone(&self.semaphore), + upload, + })) + } + async fn get(&self, location: &Path) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); let r = self.inner.get(location).await?; diff --git a/src/local.rs b/src/local.rs index a3695ad..8dec5be 100644 --- a/src/local.rs +++ b/src/local.rs @@ -39,7 +39,7 @@ use crate::{ path::{absolute_path_to_url, Path}, util::InvalidGetRange, Attributes, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, - ObjectStore, PutMode, PutOptions, PutPayload, PutResult, Result, UploadPart, + ObjectStore, PutMode, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, UploadPart, }; /// A specialized `Error` for filesystem object store-related errors @@ -404,7 +404,15 @@ impl ObjectStore for LocalFileSystem { .await } - async fn put_multipart(&self, location: &Path) -> Result> { + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + if !opts.attributes.is_empty() { + return Err(crate::Error::NotImplemented); + } + let dest = self.path_to_filesystem(location)?; let (file, src) = new_staged_upload(&dest)?; Ok(Box::new(LocalUpload::new(src, dest, file))) diff --git a/src/memory.rs b/src/memory.rs index e34b28f..daf14e1 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -31,8 +31,8 @@ use crate::multipart::{MultipartStore, PartId}; use crate::util::InvalidGetRange; use crate::{ path::Path, Attributes, GetRange, GetResult, GetResultPayload, ListResult, MultipartId, - MultipartUpload, ObjectMeta, ObjectStore, PutMode, PutOptions, PutResult, Result, - UpdateVersion, 
UploadPart, + MultipartUpload, ObjectMeta, ObjectStore, PutMode, PutMultipartOpts, PutOptions, PutResult, + Result, UpdateVersion, UploadPart, }; use crate::{GetOptions, PutPayload}; @@ -223,9 +223,14 @@ impl ObjectStore for InMemory { }) } - async fn put_multipart(&self, location: &Path) -> Result> { + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { Ok(Box::new(InMemoryUpload { location: location.clone(), + attributes: opts.attributes, parts: vec![], storage: Arc::clone(&self.storage), })) @@ -487,6 +492,7 @@ impl InMemory { #[derive(Debug)] struct InMemoryUpload { location: Path, + attributes: Attributes, parts: Vec, storage: Arc>, } @@ -503,10 +509,11 @@ impl MultipartUpload for InMemoryUpload { let mut buf = Vec::with_capacity(cap); let parts = self.parts.iter().flatten(); parts.for_each(|x| buf.extend_from_slice(x)); - let etag = self - .storage - .write() - .insert(&self.location, buf.into(), Attributes::new()); + let etag = self.storage.write().insert( + &self.location, + buf.into(), + std::mem::take(&mut self.attributes), + ); Ok(PutResult { e_tag: Some(etag.to_string()), diff --git a/src/prefix.rs b/src/prefix.rs index 1d1ffee..7c9ea58 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -22,8 +22,8 @@ use std::ops::Range; use crate::path::Path; use crate::{ - GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutOptions, - PutPayload, PutResult, Result, + GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutMultipartOpts, + PutOptions, PutPayload, PutResult, Result, }; #[doc(hidden)] @@ -100,6 +100,15 @@ impl ObjectStore for PrefixStore { self.inner.put_multipart(&full_path).await } + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + let full_path = self.full_path(location); + self.inner.put_multipart_opts(&full_path, opts).await + } + async fn get(&self, location: &Path) -> Result { let full_path = self.full_path(location); self.inner.get(&full_path).await diff --git a/src/throttle.rs b/src/throttle.rs index d089784..38b6d7c 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -23,7 +23,7 @@ use std::{convert::TryInto, sync::Arc}; use crate::multipart::{MultipartStore, PartId}; use crate::{ path::Path, GetResult, GetResultPayload, ListResult, MultipartId, MultipartUpload, ObjectMeta, - ObjectStore, PutOptions, PutPayload, PutResult, Result, + ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, }; use crate::{GetOptions, UploadPart}; use async_trait::async_trait; @@ -171,6 +171,18 @@ impl ObjectStore for ThrottledStore { })) } + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOpts, + ) -> Result> { + let upload = self.inner.put_multipart_opts(location, opts).await?; + Ok(Box::new(ThrottledUpload { + upload, + sleep: self.config().wait_put_per_call, + })) + } + async fn get(&self, location: &Path) -> Result { sleep(self.config().wait_get_per_call).await; diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index 59c5934..c5550ac 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -46,7 +46,11 @@ impl ObjectStore for MyStore { self.0.put_opts(location, payload, opts).await } - async fn put_multipart(&self, _location: &Path) -> Result> { + async fn put_multipart_opts( + &self, + _location: &Path, + _opts: PutMultipartOpts, + ) -> Result> { todo!() } From a8e052869fea2a8648fa7fa8a9bc21a5873335c4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> Date: Thu, 18 Apr 2024 11:40:33 +0100 Subject: [PATCH 296/397] Prepare object_store 0.10.0 (#5658) --- CHANGELOG-old.md | 42 ++++++++++++++++++ CHANGELOG.md | 74 +++++++++++++++++++------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 4 files changed, 88 insertions(+), 34 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 141a8b9..3ccfcad 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -21,6 +21,48 @@ # Changelog +## [object_store_0.9.1](https://github.com/apache/arrow-rs/tree/object_store_0.9.1) (2024-03-01) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.9.0...object_store_0.9.1) + +**Implemented enhancements:** + +- \[object\_store\] Enable anonymous read access for Azure [\#5424](https://github.com/apache/arrow-rs/issues/5424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support for additional URL formats in object\_store for Azure blob [\#5370](https://github.com/apache/arrow-rs/issues/5370) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Mention "Http" support in README [\#5320](https://github.com/apache/arrow-rs/issues/5320) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Pass Options to HttpBuilder in parse\_url\_opts [\#5310](https://github.com/apache/arrow-rs/issues/5310) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Remove Localstack DynamoDB Workaround Once Fixed Upstream [\#5267](https://github.com/apache/arrow-rs/issues/5267) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Can I use S3 server side encryption [\#5087](https://github.com/apache/arrow-rs/issues/5087) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- delete\_stream fails in MinIO [\#5414](https://github.com/apache/arrow-rs/issues/5414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] Completing an empty Multipart Upload fails for AWS S3 [\#5404](https://github.com/apache/arrow-rs/issues/5404) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Multipart upload can leave futures unpolled, leading to timeout [\#5366](https://github.com/apache/arrow-rs/issues/5366) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Broken Link in README \(Rust Object Store\) Content [\#5309](https://github.com/apache/arrow-rs/issues/5309) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Expose path\_to\_filesystem public [\#5441](https://github.com/apache/arrow-rs/pull/5441) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) +- Update nix requirement from 0.27.1 to 0.28.0 in /object\_store [\#5432](https://github.com/apache/arrow-rs/pull/5432) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add BufWriter for Adapative Put / Multipart Upload [\#5431](https://github.com/apache/arrow-rs/pull/5431) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Enable anonymous access for MicrosoftAzure [\#5425](https://github.com/apache/arrow-rs/pull/5425) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) 
+- fix\(object\_store\): Include Content-MD5 header for S3 DeleteObjects [\#5415](https://github.com/apache/arrow-rs/pull/5415) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([paraseba](https://github.com/paraseba)) +- docds\(object\_store\): Mention HTTP/WebDAV in README [\#5409](https://github.com/apache/arrow-rs/pull/5409) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- \[object\_store\] Fix empty Multipart Upload for AWS S3 [\#5405](https://github.com/apache/arrow-rs/pull/5405) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) +- feat: S3 server-side encryption [\#5402](https://github.com/apache/arrow-rs/pull/5402) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Pull container name from URL for Azure blob [\#5371](https://github.com/apache/arrow-rs/pull/5371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([bradvoth](https://github.com/bradvoth)) +- docs\(object-store\): add warning to flush [\#5369](https://github.com/apache/arrow-rs/pull/5369) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Minor\(docs\): update master to main for DataFusion/Ballista [\#5363](https://github.com/apache/arrow-rs/pull/5363) ([caicancai](https://github.com/caicancai)) +- Test parse\_url\_opts for HTTP \(\#5310\) [\#5316](https://github.com/apache/arrow-rs/pull/5316) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update IOx links [\#5312](https://github.com/apache/arrow-rs/pull/5312) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Pass options to HTTPBuilder in parse\_url\_opts \(\#5310\) [\#5311](https://github.com/apache/arrow-rs/pull/5311) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Bump actions/cache from 3 to 4 [\#5308](https://github.com/apache/arrow-rs/pull/5308) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Remove localstack DynamoDB workaround \(\#5267\) [\#5307](https://github.com/apache/arrow-rs/pull/5307) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- refactor: log server error during object store retries [\#5294](https://github.com/apache/arrow-rs/pull/5294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Prepare arrow 50.0.0 [\#5291](https://github.com/apache/arrow-rs/pull/5291) ([tustvold](https://github.com/tustvold)) +- Enable JS tests again [\#5287](https://github.com/apache/arrow-rs/pull/5287) ([domoritz](https://github.com/domoritz)) + ## [object_store_0.9.0](https://github.com/apache/arrow-rs/tree/object_store_0.9.0) (2024-01-05) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.8.0...object_store_0.9.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c6af67..dc58ecb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,46 +19,58 @@ # Changelog -## [object_store_0.9.1](https://github.com/apache/arrow-rs/tree/object_store_0.9.1) (2024-03-01) +## [object_store_0.10.0](https://github.com/apache/arrow-rs/tree/object_store_0.10.0) (2024-04-17) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.9.0...object_store_0.9.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.9.1...object_store_0.10.0) + +**Breaking changes:** + +- Add put\_multipart\_opts \(\#5435\) [\#5652](https://github.com/apache/arrow-rs/pull/5652) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add Attributes API \(\#5329\) [\#5650](https://github.com/apache/arrow-rs/pull/5650) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Support non-contiguous put payloads / vectored writes \(\#5514\) [\#5538](https://github.com/apache/arrow-rs/pull/5538) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Replace AsyncWrite with Upload trait and rename MultiPartStore to MultipartStore \(\#5458\) [\#5500](https://github.com/apache/arrow-rs/pull/5500) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- \[object\_store\] Enable anonymous read access for Azure [\#5424](https://github.com/apache/arrow-rs/issues/5424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support for additional URL formats in object\_store for Azure blob [\#5370](https://github.com/apache/arrow-rs/issues/5370) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Mention "Http" support in README [\#5320](https://github.com/apache/arrow-rs/issues/5320) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Pass Options to HttpBuilder in parse\_url\_opts [\#5310](https://github.com/apache/arrow-rs/issues/5310) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Remove Localstack DynamoDB Workaround Once Fixed Upstream [\#5267](https://github.com/apache/arrow-rs/issues/5267) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Can I use S3 server side encryption [\#5087](https://github.com/apache/arrow-rs/issues/5087) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Improve Retry Coverage [\#5608](https://github.com/apache/arrow-rs/issues/5608) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Zero Copy Support [\#5593](https://github.com/apache/arrow-rs/issues/5593) +- ObjectStore bulk delete [\#5591](https://github.com/apache/arrow-rs/issues/5591) +- Retry on Broken Connection [\#5589](https://github.com/apache/arrow-rs/issues/5589) +- Inconsistent Multipart Nomenclature [\#5526](https://github.com/apache/arrow-rs/issues/5526) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[ObjectStore\] Non-Contiguous Write Payloads [\#5514](https://github.com/apache/arrow-rs/issues/5514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- In Object Store, return version & etag on multipart put. 
[\#5443](https://github.com/apache/arrow-rs/issues/5443) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Release Object Store 0.9.1 [\#5436](https://github.com/apache/arrow-rs/issues/5436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: allow setting content-type per request [\#5329](https://github.com/apache/arrow-rs/issues/5329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- GCS Signed URL Support [\#5233](https://github.com/apache/arrow-rs/issues/5233) **Fixed bugs:** -- delete\_stream fails in MinIO [\#5414](https://github.com/apache/arrow-rs/issues/5414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[object\_store\] Completing an empty Multipart Upload fails for AWS S3 [\#5404](https://github.com/apache/arrow-rs/issues/5404) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Multipart upload can leave futures unpolled, leading to timeout [\#5366](https://github.com/apache/arrow-rs/issues/5366) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Broken Link in README \(Rust Object Store\) Content [\#5309](https://github.com/apache/arrow-rs/issues/5309) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] minor bug: typos present in local variable [\#5628](https://github.com/apache/arrow-rs/issues/5628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[arrow-csv\] Schema inference requires csv on disk [\#5551](https://github.com/apache/arrow-rs/issues/5551) +- Local object store copy/rename with nonexistent `from` file loops forever instead of erroring [\#5503](https://github.com/apache/arrow-rs/issues/5503) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object store ApplicationDefaultCredentials auth is not working on windows [\#5466](https://github.com/apache/arrow-rs/issues/5466) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- MicrosoftAzure store list result omits empty objects [\#5451](https://github.com/apache/arrow-rs/issues/5451) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Minor: add additional documentation about `BufWriter` [\#5519](https://github.com/apache/arrow-rs/pull/5519) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) **Merged pull requests:** -- Expose path\_to\_filesystem public [\#5441](https://github.com/apache/arrow-rs/pull/5441) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) -- Update nix requirement from 0.27.1 to 0.28.0 in /object\_store [\#5432](https://github.com/apache/arrow-rs/pull/5432) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add BufWriter for Adapative Put / Multipart Upload [\#5431](https://github.com/apache/arrow-rs/pull/5431) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Enable anonymous access for MicrosoftAzure [\#5425](https://github.com/apache/arrow-rs/pull/5425) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) -- fix\(object\_store\): Include Content-MD5 header for S3 DeleteObjects 
[\#5415](https://github.com/apache/arrow-rs/pull/5415) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([paraseba](https://github.com/paraseba)) -- docds\(object\_store\): Mention HTTP/WebDAV in README [\#5409](https://github.com/apache/arrow-rs/pull/5409) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) -- \[object\_store\] Fix empty Multipart Upload for AWS S3 [\#5405](https://github.com/apache/arrow-rs/pull/5405) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) -- feat: S3 server-side encryption [\#5402](https://github.com/apache/arrow-rs/pull/5402) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- Pull container name from URL for Azure blob [\#5371](https://github.com/apache/arrow-rs/pull/5371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([bradvoth](https://github.com/bradvoth)) -- docs\(object-store\): add warning to flush [\#5369](https://github.com/apache/arrow-rs/pull/5369) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- Minor\(docs\): update master to main for DataFusion/Ballista [\#5363](https://github.com/apache/arrow-rs/pull/5363) ([caicancai](https://github.com/caicancai)) -- Test parse\_url\_opts for HTTP \(\#5310\) [\#5316](https://github.com/apache/arrow-rs/pull/5316) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update IOx links [\#5312](https://github.com/apache/arrow-rs/pull/5312) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Pass options to HTTPBuilder in parse\_url\_opts \(\#5310\) [\#5311](https://github.com/apache/arrow-rs/pull/5311) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Bump actions/cache from 3 to 4 [\#5308](https://github.com/apache/arrow-rs/pull/5308) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Remove localstack DynamoDB workaround \(\#5267\) [\#5307](https://github.com/apache/arrow-rs/pull/5307) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- refactor: log server error during object store retries [\#5294](https://github.com/apache/arrow-rs/pull/5294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) -- Prepare arrow 50.0.0 [\#5291](https://github.com/apache/arrow-rs/pull/5291) ([tustvold](https://github.com/tustvold)) -- Enable JS tests again [\#5287](https://github.com/apache/arrow-rs/pull/5287) ([domoritz](https://github.com/domoritz)) +- minor-fix: removed typos in object\_store sub crate [\#5629](https://github.com/apache/arrow-rs/pull/5629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Silemo](https://github.com/Silemo)) +- Retry on More Error Classes [\#5609](https://github.com/apache/arrow-rs/pull/5609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) +- Fix handling of empty multipart uploads for GCS [\#5590](https://github.com/apache/arrow-rs/pull/5590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([tustvold](https://github.com/tustvold)) +- Upgrade object\_store dependency to use chrono `0.4.34` [\#5578](https://github.com/apache/arrow-rs/pull/5578) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([l1nxy](https://github.com/l1nxy)) +- Fix Latest Clippy Lints for object\_store [\#5546](https://github.com/apache/arrow-rs/pull/5546) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update reqwest 0.12 and http 1.0 [\#5536](https://github.com/apache/arrow-rs/pull/5536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement MultipartStore for ThrottledStore [\#5533](https://github.com/apache/arrow-rs/pull/5533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix: copy/rename return error if source is nonexistent [\#5528](https://github.com/apache/arrow-rs/pull/5528) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dimbtp](https://github.com/dimbtp)) +- Prepare arrow 51.0.0 [\#5516](https://github.com/apache/arrow-rs/pull/5516) ([tustvold](https://github.com/tustvold)) +- Implement MultiPartStore for InMemory [\#5495](https://github.com/apache/arrow-rs/pull/5495) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add more comprehensive documentation on testing and benchmarking to CONTRIBUTING.md [\#5478](https://github.com/apache/arrow-rs/pull/5478) ([monkwire](https://github.com/monkwire)) +- add support for gcp application default auth on windows in object store [\#5473](https://github.com/apache/arrow-rs/pull/5473) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Itayazolay](https://github.com/Itayazolay)) +- Update base64 requirement from 0.21 to 0.22 in /object\_store [\#5465](https://github.com/apache/arrow-rs/pull/5465) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Uses ResourceType for filtering list directories instead of workaround [\#5452](https://github.com/apache/arrow-rs/pull/5452) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) +- Add GCS signed URL support [\#5300](https://github.com/apache/arrow-rs/pull/5300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([l1nxy](https://github.com/l1nxy)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index d0c3af2..ca1e5b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.9.1" +version = "0.10.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 83d5c32..5a28409 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.9.0" -FUTURE_RELEASE="object_store_0.9.1" +SINCE_TAG="object_store_0.9.1" +FUTURE_RELEASE="object_store_0.10.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 01cd3180dd533acb532652f771caffa7d2aa9692 Mon Sep 17 00:00:00 2001 From: nett_hier 
<66856670+netthier@users.noreply.github.com> Date: Fri, 26 Apr 2024 12:40:28 +0200 Subject: [PATCH 297/397] Add more attributes (#5690) Signed-off-by: netthier --- src/attributes.rs | 36 +++++++++++++++++++++++--- src/aws/client.rs | 10 ++++++-- src/azure/client.rs | 9 +++++++ src/client/get.rs | 62 ++++++++++++++++++++++++++++++++++++++------- src/gcp/client.rs | 8 +++++- src/http/client.rs | 11 ++++++-- src/lib.rs | 8 +++++- 7 files changed, 125 insertions(+), 19 deletions(-) diff --git a/src/attributes.rs b/src/attributes.rs index 9b90b53..ecef32e 100644 --- a/src/attributes.rs +++ b/src/attributes.rs @@ -23,6 +23,18 @@ use std::ops::Deref; #[non_exhaustive] #[derive(Debug, Hash, Eq, PartialEq, Clone)] pub enum Attribute { + /// Specifies how the object should be handled by a browser + /// + /// See [Content-Disposition](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Disposition) + ContentDisposition, + /// Specifies the encodings applied to the object + /// + /// See [Content-Encoding](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding) + ContentEncoding, + /// Specifies the language of the object + /// + /// See [Content-Language](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Language) + ContentLanguage, /// Specifies the MIME type of the object /// /// This takes precedence over any [ClientOptions](crate::ClientOptions) configuration @@ -177,12 +189,15 @@ mod tests { #[test] fn test_attributes_basic() { let mut attributes = Attributes::from_iter([ + (Attribute::ContentDisposition, "inline"), + (Attribute::ContentEncoding, "gzip"), + (Attribute::ContentLanguage, "en-US"), (Attribute::ContentType, "test"), (Attribute::CacheControl, "control"), ]); assert!(!attributes.is_empty()); - assert_eq!(attributes.len(), 2); + assert_eq!(attributes.len(), 5); assert_eq!( attributes.get(&Attribute::ContentType), @@ -195,17 +210,30 @@ mod tests { attributes.insert(Attribute::CacheControl, "v1".into()), Some(metav) ); - assert_eq!(attributes.len(), 2); + assert_eq!(attributes.len(), 5); assert_eq!( attributes.remove(&Attribute::CacheControl).unwrap(), "v1".into() ); - assert_eq!(attributes.len(), 1); + assert_eq!(attributes.len(), 4); let metav: AttributeValue = "v2".into(); attributes.insert(Attribute::CacheControl, metav.clone()); assert_eq!(attributes.get(&Attribute::CacheControl), Some(&metav)); - assert_eq!(attributes.len(), 2); + assert_eq!(attributes.len(), 5); + + assert_eq!( + attributes.get(&Attribute::ContentDisposition), + Some(&"inline".into()) + ); + assert_eq!( + attributes.get(&Attribute::ContentEncoding), + Some(&"gzip".into()) + ); + assert_eq!( + attributes.get(&Attribute::ContentLanguage), + Some(&"en-US".into()) + ); } } diff --git a/src/aws/client.rs b/src/aws/client.rs index 4a4dc17..2424768 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -42,14 +42,17 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; -use hyper::header::{CACHE_CONTROL, CONTENT_LENGTH}; +use hyper::header::{ + CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, + CONTENT_TYPE, +}; use hyper::http::HeaderName; use hyper::{http, HeaderMap}; use itertools::Itertools; use md5::{Digest, Md5}; use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; -use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, RequestBuilder, Response}; +use reqwest::{Client as ReqwestClient, Method, 
RequestBuilder, Response}; use ring::digest; use ring::digest::Context; use serde::{Deserialize, Serialize}; @@ -322,6 +325,9 @@ impl<'a> Request<'a> { for (k, v) in &attributes { builder = match k { Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentDisposition => builder.header(CONTENT_DISPOSITION, v.as_ref()), + Attribute::ContentEncoding => builder.header(CONTENT_ENCODING, v.as_ref()), + Attribute::ContentLanguage => builder.header(CONTENT_LANGUAGE, v.as_ref()), Attribute::ContentType => { has_content_type = true; builder.header(CONTENT_TYPE, v.as_ref()) diff --git a/src/azure/client.rs b/src/azure/client.rs index 918fcd0..311bd72 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -50,6 +50,10 @@ use url::Url; const VERSION_HEADER: &str = "x-ms-version-id"; static MS_CACHE_CONTROL: HeaderName = HeaderName::from_static("x-ms-blob-cache-control"); static MS_CONTENT_TYPE: HeaderName = HeaderName::from_static("x-ms-blob-content-type"); +static MS_CONTENT_DISPOSITION: HeaderName = + HeaderName::from_static("x-ms-blob-content-disposition"); +static MS_CONTENT_ENCODING: HeaderName = HeaderName::from_static("x-ms-blob-content-encoding"); +static MS_CONTENT_LANGUAGE: HeaderName = HeaderName::from_static("x-ms-blob-content-language"); static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); @@ -206,6 +210,11 @@ impl<'a> PutRequest<'a> { for (k, v) in &attributes { builder = match k { Attribute::CacheControl => builder.header(&MS_CACHE_CONTROL, v.as_ref()), + Attribute::ContentDisposition => { + builder.header(&MS_CONTENT_DISPOSITION, v.as_ref()) + } + Attribute::ContentEncoding => builder.header(&MS_CONTENT_ENCODING, v.as_ref()), + Attribute::ContentLanguage => builder.header(&MS_CONTENT_LANGUAGE, v.as_ref()), Attribute::ContentType => { has_content_type = true; builder.header(&MS_CONTENT_TYPE, v.as_ref()) diff --git a/src/client/get.rs b/src/client/get.rs index f700457..430b87b 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -22,7 +22,10 @@ use crate::path::Path; use crate::{Attribute, Attributes, GetOptions, GetRange, GetResult, GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; -use hyper::header::{CACHE_CONTROL, CONTENT_RANGE, CONTENT_TYPE}; +use hyper::header::{ + CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_RANGE, + CONTENT_TYPE, +}; use hyper::StatusCode; use reqwest::header::ToStrError; use reqwest::Response; @@ -120,6 +123,15 @@ enum GetResultError { #[snafu(display("Cache-Control header contained non UTF-8 characters"))] InvalidCacheControl { source: ToStrError }, + #[snafu(display("Content-Disposition header contained non UTF-8 characters"))] + InvalidContentDisposition { source: ToStrError }, + + #[snafu(display("Content-Encoding header contained non UTF-8 characters"))] + InvalidContentEncoding { source: ToStrError }, + + #[snafu(display("Content-Language header contained non UTF-8 characters"))] + InvalidContentLanguage { source: ToStrError }, + #[snafu(display("Content-Type header contained non UTF-8 characters"))] InvalidContentType { source: ToStrError }, @@ -167,16 +179,48 @@ fn get_result( 0..meta.size }; - let mut attributes = Attributes::new(); - if let Some(x) = response.headers().get(CACHE_CONTROL) { - let x = x.to_str().context(InvalidCacheControlSnafu)?; - attributes.insert(Attribute::CacheControl, x.to_string().into()); - } - if let Some(x) = response.headers().get(CONTENT_TYPE) { - let x = 
x.to_str().context(InvalidContentTypeSnafu)?; - attributes.insert(Attribute::ContentType, x.to_string().into()); + macro_rules! parse_attributes { + ($headers:expr, $(($header:expr, $attr:expr, $err:expr)),*) => {{ + let mut attributes = Attributes::new(); + $( + if let Some(x) = $headers.get($header) { + let x = x.to_str().context($err)?; + attributes.insert($attr, x.to_string().into()); + } + )* + attributes + }} } + let attributes = parse_attributes!( + response.headers(), + ( + CACHE_CONTROL, + Attribute::CacheControl, + InvalidCacheControlSnafu + ), + ( + CONTENT_DISPOSITION, + Attribute::ContentDisposition, + InvalidContentDispositionSnafu + ), + ( + CONTENT_ENCODING, + Attribute::ContentEncoding, + InvalidContentEncodingSnafu + ), + ( + CONTENT_LANGUAGE, + Attribute::ContentLanguage, + InvalidContentLanguageSnafu + ), + ( + CONTENT_TYPE, + Attribute::ContentType, + InvalidContentTypeSnafu + ) + ); + let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 9c39efe..a549325 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -36,7 +36,10 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Buf; -use hyper::header::{CACHE_CONTROL, CONTENT_LENGTH, CONTENT_TYPE}; +use hyper::header::{ + CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, + CONTENT_TYPE, +}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::header::HeaderName; use reqwest::{Client, Method, RequestBuilder, Response, StatusCode}; @@ -195,6 +198,9 @@ impl<'a> Request<'a> { for (k, v) in &attributes { builder = match k { Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentDisposition => builder.header(CONTENT_DISPOSITION, v.as_ref()), + Attribute::ContentEncoding => builder.header(CONTENT_ENCODING, v.as_ref()), + Attribute::ContentLanguage => builder.header(CONTENT_LANGUAGE, v.as_ref()), Attribute::ContentType => { has_content_type = true; builder.header(CONTENT_TYPE, v.as_ref()) diff --git a/src/http/client.rs b/src/http/client.rs index cf25919..4dccef8 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -25,9 +25,11 @@ use crate::{Attribute, Attributes, ClientOptions, GetOptions, ObjectMeta, PutPay use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; -use hyper::header::{CACHE_CONTROL, CONTENT_LENGTH}; +use hyper::header::{ + CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, + CONTENT_TYPE, +}; use percent_encoding::percent_decode_str; -use reqwest::header::CONTENT_TYPE; use reqwest::{Method, Response, StatusCode}; use serde::Deserialize; use snafu::{OptionExt, ResultExt, Snafu}; @@ -172,6 +174,11 @@ impl Client { for (k, v) in &attributes { builder = match k { Attribute::CacheControl => builder.header(CACHE_CONTROL, v.as_ref()), + Attribute::ContentDisposition => { + builder.header(CONTENT_DISPOSITION, v.as_ref()) + } + Attribute::ContentEncoding => builder.header(CONTENT_ENCODING, v.as_ref()), + Attribute::ContentLanguage => builder.header(CONTENT_LANGUAGE, v.as_ref()), Attribute::ContentType => { has_content_type = true; builder.header(CONTENT_TYPE, v.as_ref()) diff --git a/src/lib.rs b/src/lib.rs index ad72bd2..c99e15a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1744,8 +1744,14 @@ mod tests { pub(crate) async fn put_get_attributes(integration: &dyn ObjectStore) { // Test handling of attributes let 
attributes = Attributes::from_iter([ - (Attribute::ContentType, "text/html; charset=utf-8"), (Attribute::CacheControl, "max-age=604800"), + ( + Attribute::ContentDisposition, + r#"attachment; filename="test.html""#, + ), + (Attribute::ContentEncoding, "gzip"), + (Attribute::ContentLanguage, "en-US"), + (Attribute::ContentType, "text/html; charset=utf-8"), ]); let path = Path::from("attributes"); From f7201110e1c4a8b42c7fa27a8a47c0c03ec615e9 Mon Sep 17 00:00:00 2001 From: nett_hier <66856670+netthier@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:06:52 +0200 Subject: [PATCH 298/397] Add `BufWriter::with_attributes` and `::with_tags` in `object_store` (#5693) * Add `BufWriter::with_attributes` Signed-off-by: netthier * Add `BufWriter::with_tags` Signed-off-by: netthier --------- Signed-off-by: netthier --- src/aws/mod.rs | 14 ++++++--- src/azure/mod.rs | 14 ++++++--- src/buffered.rs | 77 +++++++++++++++++++++++++++++++++++++++++++----- src/lib.rs | 13 ++++++-- 4 files changed, 99 insertions(+), 19 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 7f1edf1..5bc6d56 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -437,10 +437,16 @@ mod tests { // Object tagging is not supported by S3 Express One Zone if config.session_provider.is_none() { - tagging(&integration, !config.disable_tagging, |p| { - let client = Arc::clone(&integration.client); - async move { client.get_object_tagging(&p).await } - }) + tagging( + Arc::new(AmazonS3 { + client: Arc::clone(&integration.client), + }), + !config.disable_tagging, + |p| { + let client = Arc::clone(&integration.client); + async move { client.get_object_tagging(&p).await } + }, + ) .await; } diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 25ae6dd..755f3b1 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -296,10 +296,16 @@ mod tests { signing(&integration).await; let validate = !integration.client.config().disable_tagging; - tagging(&integration, validate, |p| { - let client = Arc::clone(&integration.client); - async move { client.get_blob_tagging(&p).await } - }) + tagging( + Arc::new(MicrosoftAzure { + client: Arc::clone(&integration.client), + }), + validate, + |p| { + let client = Arc::clone(&integration.client); + async move { client.get_blob_tagging(&p).await } + }, + ) .await; // Azurite doesn't support attributes properly diff --git a/src/buffered.rs b/src/buffered.rs index d412241..feb84d4 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -18,7 +18,10 @@ //! 
Utilities for performing tokio-style buffered IO use crate::path::Path; -use crate::{ObjectMeta, ObjectStore, PutPayloadMut, WriteMultipart}; +use crate::{ + Attributes, ObjectMeta, ObjectStore, PutMultipartOpts, PutOptions, PutPayloadMut, TagSet, + WriteMultipart, +}; use bytes::Bytes; use futures::future::{BoxFuture, FutureExt}; use futures::ready; @@ -217,6 +220,8 @@ impl AsyncBufRead for BufReader { pub struct BufWriter { capacity: usize, max_concurrency: usize, + attributes: Option, + tags: Option, state: BufWriterState, store: Arc, } @@ -252,6 +257,8 @@ impl BufWriter { capacity, store, max_concurrency: 8, + attributes: None, + tags: None, state: BufWriterState::Buffer(path, PutPayloadMut::new()), } } @@ -266,6 +273,22 @@ impl BufWriter { } } + /// Set the attributes of the uploaded object + pub fn with_attributes(self, attributes: Attributes) -> Self { + Self { + attributes: Some(attributes), + ..self + } + } + + /// Set the tags of the uploaded object + pub fn with_tags(self, tags: TagSet) -> Self { + Self { + tags: Some(tags), + ..self + } + } + /// Abort this writer, cleaning up any partially uploaded state /// /// # Panic @@ -306,9 +329,13 @@ impl AsyncWrite for BufWriter { if b.content_length().saturating_add(buf.len()) >= cap { let buffer = std::mem::take(b); let path = std::mem::take(path); + let opts = PutMultipartOpts { + attributes: self.attributes.take().unwrap_or_default(), + tags: self.tags.take().unwrap_or_default(), + }; let store = Arc::clone(&self.store); self.state = BufWriterState::Prepare(Box::pin(async move { - let upload = store.put_multipart(&path).await?; + let upload = store.put_multipart_opts(&path, opts).await?; let mut chunked = WriteMultipart::new_with_chunk_size(upload, cap); for chunk in buffer.freeze() { chunked.put(chunk); @@ -346,9 +373,14 @@ impl AsyncWrite for BufWriter { BufWriterState::Buffer(p, b) => { let buf = std::mem::take(b); let path = std::mem::take(p); + let opts = PutOptions { + attributes: self.attributes.take().unwrap_or_default(), + tags: self.tags.take().unwrap_or_default(), + ..Default::default() + }; let store = Arc::clone(&self.store); self.state = BufWriterState::Flush(Box::pin(async move { - store.put(&path, buf.into()).await?; + store.put_opts(&path, buf.into(), opts).await?; Ok(()) })); } @@ -383,6 +415,7 @@ mod tests { use super::*; use crate::memory::InMemory; use crate::path::Path; + use crate::{Attribute, GetOptions}; use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; #[tokio::test] @@ -464,26 +497,54 @@ mod tests { } } + // Note: `BufWriter::with_tags` functionality is tested in `crate::tests::tagging` #[tokio::test] async fn test_buf_writer() { let store = Arc::new(InMemory::new()) as Arc; let path = Path::from("file.txt"); + let attributes = Attributes::from_iter([ + (Attribute::ContentType, "text/html"), + (Attribute::CacheControl, "max-age=604800"), + ]); // Test put - let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 30); + let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 30) + .with_attributes(attributes.clone()); writer.write_all(&[0; 20]).await.unwrap(); writer.flush().await.unwrap(); writer.write_all(&[0; 5]).await.unwrap(); writer.shutdown().await.unwrap(); - assert_eq!(store.head(&path).await.unwrap().size, 25); + let response = store + .get_opts( + &path, + GetOptions { + head: true, + ..Default::default() + }, + ) + .await + .unwrap(); + assert_eq!(response.meta.size, 25); + assert_eq!(response.attributes, attributes); // Test 
multipart - let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 30); + let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 30) + .with_attributes(attributes.clone()); writer.write_all(&[0; 20]).await.unwrap(); writer.flush().await.unwrap(); writer.write_all(&[0; 20]).await.unwrap(); writer.shutdown().await.unwrap(); - - assert_eq!(store.head(&path).await.unwrap().size, 40); + let response = store + .get_opts( + &path, + GetOptions { + head: true, + ..Default::default() + }, + ) + .await + .unwrap(); + assert_eq!(response.meta.size, 40); + assert_eq!(response.attributes, attributes); } } diff --git a/src/lib.rs b/src/lib.rs index c99e15a..9a8f77b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1314,12 +1314,14 @@ mod test_util { #[cfg(test)] mod tests { use super::*; + use crate::buffered::BufWriter; use crate::multipart::MultipartStore; use crate::test_util::flatten_list_stream; use chrono::TimeZone; use futures::stream::FuturesUnordered; use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; + use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { delete_fixtures(storage).await; @@ -2365,7 +2367,7 @@ mod tests { } #[cfg(any(feature = "aws", feature = "azure"))] - pub(crate) async fn tagging(storage: &dyn ObjectStore, validate: bool, get_tags: F) + pub(crate) async fn tagging(storage: Arc, validate: bool, get_tags: F) where F: Fn(Path) -> Fut + Send + Sync, Fut: std::future::Future> + Send, @@ -2415,19 +2417,24 @@ mod tests { let multi_path = Path::from("tag_test_multi"); let mut write = storage - .put_multipart_opts(&multi_path, tag_set.into()) + .put_multipart_opts(&multi_path, tag_set.clone().into()) .await .unwrap(); write.put_part("foo".into()).await.unwrap(); write.complete().await.unwrap(); + let buf_path = Path::from("tag_test_buf"); + let mut buf = BufWriter::new(storage, buf_path.clone()).with_tags(tag_set); + buf.write_all(b"foo").await.unwrap(); + buf.shutdown().await.unwrap(); + // Write should always succeed, but certain configurations may simply ignore tags if !validate { return; } - for path in [path, multi_path] { + for path in [path, multi_path, buf_path] { let resp = get_tags(path.clone()).await.unwrap(); let body = resp.bytes().await.unwrap(); From cbe489b3aaa6a01bdfe1dedcf0d53f7a38244547 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 4 May 2024 22:13:25 +0100 Subject: [PATCH 299/397] Export object_store integration tests (#5709) * Export object_store integration tests * Clippy * Clippy * Even more clippy * Format --- Cargo.toml | 1 + src/aws/builder.rs | 12 - src/aws/client.rs | 6 - src/aws/dynamo.rs | 12 - src/aws/mod.rs | 18 +- src/azure/builder.rs | 5 - src/azure/client.rs | 11 - src/azure/mod.rs | 3 +- src/chunked.rs | 2 +- src/client/list.rs | 1 + src/gcp/builder.rs | 5 - src/gcp/client.rs | 6 - src/gcp/credential.rs | 2 +- src/gcp/mod.rs | 15 +- src/http/mod.rs | 6 +- src/integration.rs | 1082 +++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1082 +---------------------------------------- src/limit.rs | 2 +- src/local.rs | 3 +- src/memory.rs | 2 +- src/parse.rs | 6 - src/prefix.rs | 3 +- src/throttle.rs | 2 +- 23 files changed, 1130 insertions(+), 1157 deletions(-) create mode 100644 src/integration.rs diff --git a/Cargo.toml b/Cargo.toml index ca1e5b3..c61946f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,7 @@ gcp = ["cloud", "rustls-pemfile"] aws = ["cloud", "md-5"] http = ["cloud"] 
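# A minimal sketch (not part of this patch) of how a downstream crate could opt in to
# the shared integration tests via the `integration` feature introduced just below;
# the dependency version shown is illustrative only:
#
# [dev-dependencies]
# object_store = { version = "0.10", features = ["integration"] }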
tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] +integration = [] [dev-dependencies] # In alphabetical order futures-test = "0.3" diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 664e183..1e42093 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -70,21 +70,9 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - #[snafu(display("Bucket '{}' not found", bucket))] - BucketNotFound { bucket: String }, - - #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] - ResolveRegion { - bucket: String, - source: reqwest::Error, - }, - #[snafu(display("Invalid Zone suffix for bucket '{bucket}'"))] ZoneSuffix { bucket: String }, - #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] - RegionParse { bucket: String }, - #[snafu(display("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", and \"sse:kms:dsse\".", passed))] InvalidEncryptionType { passed: String }, diff --git a/src/aws/client.rs b/src/aws/client.rs index 2424768..98226c4 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -66,12 +66,6 @@ const SHA256_CHECKSUM: &str = "x-amz-checksum-sha256"; #[derive(Debug, Snafu)] #[allow(missing_docs)] pub(crate) enum Error { - #[snafu(display("Error fetching get response body {}: {}", path, source))] - GetResponseBody { - source: reqwest::Error, - path: String, - }, - #[snafu(display("Error performing DeleteObjects request: {}", source))] DeleteObjectsRequest { source: crate::client::retry::Error }, diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index 2390187..9de67e5 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -451,18 +451,6 @@ struct PutItem<'a> { return_values_on_condition_check_failure: Option, } -/// A DynamoDB [GetItem] payload -/// -/// [GetItem]: https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_GetItem.html -#[derive(Serialize)] -#[serde(rename_all = "PascalCase")] -struct GetItem<'a> { - /// The table name - table_name: &'a str, - /// The primary key - key: Map<'a, &'a str, AttributeValue<'a>>, -} - #[derive(Deserialize)] struct ErrorResponse<'a> { #[serde(rename = "__type")] diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 5bc6d56..f5204a5 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -409,14 +409,16 @@ impl MultipartStore for AmazonS3 { #[cfg(test)] mod tests { use super::*; - use crate::{client::get::GetClient, tests::*}; + use crate::client::get::GetClient; + use crate::integration::*; + use crate::tests::*; use hyper::HeaderMap; const NON_EXISTENT_NAME: &str = "nonexistentname"; #[tokio::test] async fn s3_test() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let config = AmazonS3Builder::from_env(); let integration = config.build().unwrap(); @@ -475,7 +477,7 @@ mod tests { #[tokio::test] async fn s3_test_get_nonexistent_location() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let integration = AmazonS3Builder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -488,7 +490,7 @@ mod tests { #[tokio::test] async fn s3_test_get_nonexistent_bucket() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); @@ -500,7 +502,7 @@ mod tests { #[tokio::test] async fn s3_test_put_nonexistent_bucket() { - crate::test_util::maybe_skip_integration!(); + 
maybe_skip_integration!(); let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); @@ -513,7 +515,7 @@ mod tests { #[tokio::test] async fn s3_test_delete_nonexistent_location() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let integration = AmazonS3Builder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -523,7 +525,7 @@ mod tests { #[tokio::test] async fn s3_test_delete_nonexistent_bucket() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); @@ -560,7 +562,7 @@ mod tests { } async fn s3_encryption(store: &AmazonS3) { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let data = PutPayload::from(vec![3u8; 1024]); diff --git a/src/azure/builder.rs b/src/azure/builder.rs index ee09534..c0c4e89 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -89,11 +89,6 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - - #[snafu(display("Unable to extract metadata from headers: {}", source))] - Metadata { - source: crate::client::header::Error, - }, } impl From for crate::Error { diff --git a/src/azure/client.rs b/src/azure/client.rs index 311bd72..be760c7 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -67,12 +67,6 @@ pub(crate) enum Error { path: String, }, - #[snafu(display("Error getting get response body {}: {}", path, source))] - GetResponseBody { - source: reqwest::Error, - path: String, - }, - #[snafu(display("Error performing put request {}: {}", path, source))] PutRequest { source: crate::client::retry::Error, @@ -94,11 +88,6 @@ pub(crate) enum Error { #[snafu(display("Got invalid list response: {}", source))] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error authorizing request: {}", source))] - Authorization { - source: crate::azure::credential::Error, - }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] Metadata { source: crate::client::header::Error, diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 755f3b1..f89a184 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -276,12 +276,13 @@ impl MultipartStore for MicrosoftAzure { #[cfg(test)] mod tests { use super::*; + use crate::integration::*; use crate::tests::*; use bytes::Bytes; #[tokio::test] async fn azure_blob_test() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let integration = MicrosoftAzureBuilder::from_env().build().unwrap(); put_get_delete_list(&integration).await; diff --git a/src/chunked.rs b/src/chunked.rs index a3bd762..98cc204 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -178,10 +178,10 @@ impl ObjectStore for ChunkedStore { mod tests { use futures::StreamExt; + use crate::integration::*; use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; - use crate::tests::*; use super::*; diff --git a/src/client/list.rs b/src/client/list.rs index 371894d..2dbe20f 100644 --- a/src/client/list.rs +++ b/src/client/list.rs @@ -48,6 +48,7 @@ pub trait ListClientExt { fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; + #[allow(unused)] fn list_with_offset( &self, prefix: Option<&Path>, diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index e6da312..82dab14 100644 --- a/src/gcp/builder.rs 
+++ b/src/gcp/builder.rs @@ -60,11 +60,6 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] - Metadata { - source: crate::client::header::Error, - }, - #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index a549325..35c64cc 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -81,12 +81,6 @@ enum Error { #[snafu(display("Got invalid put response: {}", source))] InvalidPutResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error performing post request {}: {}", path, source))] - PostRequest { - source: crate::client::retry::Error, - path: String, - }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] Metadata { source: crate::client::header::Error, diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index d7fc2ce..829db9b 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -536,7 +536,7 @@ impl ApplicationDefaultCredentials { let path = Path::new(&home).join(Self::CREDENTIALS_PATH); // It's expected for this file to not exist unless it has been explicitly configured by the user. - if path.try_exists().unwrap_or(false) { + if path.exists() { return read_credentials_file::(path).map(Some); } } diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 0ec6e7e..039ec46 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -273,6 +273,7 @@ mod test { use credential::DEFAULT_GCS_BASE_URL; + use crate::integration::*; use crate::tests::*; use super::*; @@ -281,7 +282,7 @@ mod test { #[tokio::test] async fn gcs_test() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); put_get_delete_list(&integration).await; @@ -307,7 +308,7 @@ mod test { #[tokio::test] #[ignore] async fn gcs_test_sign() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); let client = reqwest::Client::new(); @@ -336,7 +337,7 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_location() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -351,7 +352,7 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_bucket() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let config = GoogleCloudStorageBuilder::from_env(); let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); @@ -369,7 +370,7 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_location() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -383,7 +384,7 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_bucket() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let config = GoogleCloudStorageBuilder::from_env(); let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); @@ -398,7 +399,7 @@ mod test { #[tokio::test] async fn gcs_test_put_nonexistent_bucket() { - crate::test_util::maybe_skip_integration!(); + 
maybe_skip_integration!(); let config = GoogleCloudStorageBuilder::from_env(); let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); diff --git a/src/http/mod.rs b/src/http/mod.rs index 404211e..4b1c927 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -64,9 +64,6 @@ enum Error { Metadata { source: crate::client::header::Error, }, - - #[snafu(display("Request error: {}", source))] - Reqwest { source: reqwest::Error }, } impl From for crate::Error { @@ -249,13 +246,14 @@ impl HttpBuilder { #[cfg(test)] mod tests { + use crate::integration::*; use crate::tests::*; use super::*; #[tokio::test] async fn http_test() { - crate::test_util::maybe_skip_integration!(); + maybe_skip_integration!(); let url = std::env::var("HTTP_URL").expect("HTTP_URL must be set"); let options = ClientOptions::new().with_allow_http(true); let integration = HttpBuilder::new() diff --git a/src/integration.rs b/src/integration.rs new file mode 100644 index 0000000..9a7d117 --- /dev/null +++ b/src/integration.rs @@ -0,0 +1,1082 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Integration tests for custom object store implementations +//! +//! NB: These tests will delete everything present in the provided [`DynObjectStore`]. +//! +//! These tests are not a stable part of the public API and breaking changes may be made +//! in patch releases. +//! +//! They are intended solely for testing purposes. 
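// A minimal, illustrative sketch (not part of this patch) of how a downstream crate with
// its own `ObjectStore` implementation might drive these exported helpers from its test
// suite. It assumes the `integration` feature is enabled and tokio's test macro is
// available; `InMemory` stands in here for the custom store under test.
#[cfg(test)]
mod conformance {
    use object_store::integration::*;
    use object_store::memory::InMemory;

    #[tokio::test]
    async fn conformance_suite() {
        // Each helper deletes everything in the store, so only point them at a disposable store
        let store = InMemory::new();
        put_get_delete_list(&store).await;
        list_uses_directories_correctly(&store).await;
        list_with_delimiter(&store).await;
        rename_and_copy(&store).await;
        copy_if_not_exists(&store).await;
    }
}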
+ +use crate::multipart::MultipartStore; +use crate::path::Path; +use crate::{ + Attribute, Attributes, DynObjectStore, Error, GetOptions, GetRange, ObjectStore, PutMode, + PutPayload, UpdateVersion, WriteMultipart, +}; +use bytes::Bytes; +use futures::stream::FuturesUnordered; +use futures::{StreamExt, TryStreamExt}; +use rand::distributions::Alphanumeric; +use rand::{thread_rng, Rng}; + +pub(crate) async fn flatten_list_stream( + storage: &DynObjectStore, + prefix: Option<&Path>, +) -> crate::Result> { + storage + .list(prefix) + .map_ok(|meta| meta.location) + .try_collect::>() + .await +} + +/// Tests basic read/write and listing operations +pub async fn put_get_delete_list(storage: &DynObjectStore) { + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await.unwrap(); + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {content_list:?}" + ); + + let location = Path::from("test_dir/test_file.json"); + + let data = Bytes::from("arbitrary data"); + storage.put(&location, data.clone().into()).await.unwrap(); + + let root = Path::from("/"); + + // List everything + let content_list = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(content_list, &[location.clone()]); + + // Should behave the same as no prefix + let content_list = flatten_list_stream(storage, Some(&root)).await.unwrap(); + assert_eq!(content_list, &[location.clone()]); + + // List with delimiter + let result = storage.list_with_delimiter(None).await.unwrap(); + assert_eq!(&result.objects, &[]); + assert_eq!(result.common_prefixes.len(), 1); + assert_eq!(result.common_prefixes[0], Path::from("test_dir")); + + // Should behave the same as no prefix + let result = storage.list_with_delimiter(Some(&root)).await.unwrap(); + assert!(result.objects.is_empty()); + assert_eq!(result.common_prefixes.len(), 1); + assert_eq!(result.common_prefixes[0], Path::from("test_dir")); + + // Should return not found + let err = storage.get(&Path::from("test_dir")).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + // Should return not found + let err = storage.head(&Path::from("test_dir")).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + + // List everything starting with a prefix that should return results + let prefix = Path::from("test_dir"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a prefix that shouldn't return results + let prefix = Path::from("something"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); + assert!(content_list.is_empty()); + + let read_data = storage.get(&location).await.unwrap().bytes().await.unwrap(); + assert_eq!(&*read_data, data); + + // Test range request + let range = 3..7; + let range_result = storage.get_range(&location, range.clone()).await; + + let bytes = range_result.unwrap(); + assert_eq!(bytes, data.slice(range.clone())); + + let opts = GetOptions { + range: Some(GetRange::Bounded(2..5)), + ..Default::default() + }; + let result = storage.get_opts(&location, opts).await.unwrap(); + // Data is `"arbitrary data"`, length 14 bytes + assert_eq!(result.meta.size, 14); // Should return full object size (#5272) + assert_eq!(result.range, 2..5); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"bit".as_ref()); + + let out_of_range = 200..300; + let out_of_range_result = storage.get_range(&location, out_of_range).await; + + // Should be a non-fatal error + out_of_range_result.unwrap_err(); + + let opts = GetOptions { + range: Some(GetRange::Bounded(2..100)), + ..Default::default() + }; + let result = storage.get_opts(&location, opts).await.unwrap(); + assert_eq!(result.range, 2..14); + assert_eq!(result.meta.size, 14); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"bitrary data".as_ref()); + + let opts = GetOptions { + range: Some(GetRange::Suffix(2)), + ..Default::default() + }; + match storage.get_opts(&location, opts).await { + Ok(result) => { + assert_eq!(result.range, 12..14); + assert_eq!(result.meta.size, 14); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"ta".as_ref()); + } + Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let opts = GetOptions { + range: Some(GetRange::Suffix(100)), + ..Default::default() + }; + match storage.get_opts(&location, opts).await { + Ok(result) => { + assert_eq!(result.range, 0..14); + assert_eq!(result.meta.size, 14); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"arbitrary data".as_ref()); + } + Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let opts = GetOptions { + range: Some(GetRange::Offset(3)), + ..Default::default() + }; + let result = storage.get_opts(&location, opts).await.unwrap(); + assert_eq!(result.range, 3..14); + assert_eq!(result.meta.size, 14); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, b"itrary data".as_ref()); + + let opts = GetOptions { + range: Some(GetRange::Offset(100)), + ..Default::default() + }; + storage.get_opts(&location, opts).await.unwrap_err(); + + let ranges = vec![0..1, 2..3, 0..5]; + let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); + for (range, bytes) in ranges.iter().zip(bytes) { + assert_eq!(bytes, data.slice(range.clone())) + } + + let head = storage.head(&location).await.unwrap(); + assert_eq!(head.size, data.len()); + + storage.delete(&location).await.unwrap(); + + let content_list = flatten_list_stream(storage, None).await.unwrap(); + assert!(content_list.is_empty()); + + let err = storage.get(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + + let err = storage.head(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + // Test handling of paths containing an encoded delimiter + + let file_with_delimiter = Path::from_iter(["a", "b/c", "foo.file"]); + storage + .put(&file_with_delimiter, "arbitrary".into()) + .await + .unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(files, vec![file_with_delimiter.clone()]); + + let files = flatten_list_stream(storage, Some(&Path::from("a/b"))) + .await + .unwrap(); + assert!(files.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from("a/b"))) + .await + .unwrap(); + assert!(files.common_prefixes.is_empty()); + assert!(files.objects.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from("a"))) + .await + .unwrap(); + assert_eq!(files.common_prefixes, vec![Path::from_iter(["a", "b/c"])]); + assert!(files.objects.is_empty()); + + let files = storage + .list_with_delimiter(Some(&Path::from_iter(["a", "b/c"]))) + .await + .unwrap(); + assert!(files.common_prefixes.is_empty()); + assert_eq!(files.objects.len(), 1); + assert_eq!(files.objects[0].location, file_with_delimiter); + + storage.delete(&file_with_delimiter).await.unwrap(); + + // Test handling of paths containing non-ASCII characters, e.g. emoji + + let emoji_prefix = Path::from("🙀"); + let emoji_file = Path::from("🙀/😀.parquet"); + storage.put(&emoji_file, "arbitrary".into()).await.unwrap(); + + storage.head(&emoji_file).await.unwrap(); + storage + .get(&emoji_file) + .await + .unwrap() + .bytes() + .await + .unwrap(); + + let files = flatten_list_stream(storage, Some(&emoji_prefix)) + .await + .unwrap(); + + assert_eq!(files, vec![emoji_file.clone()]); + + let dst = Path::from("foo.parquet"); + storage.copy(&emoji_file, &dst).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst.clone()]); + + let dst2 = Path::from("new/nested/foo.parquet"); + storage.copy(&emoji_file, &dst2).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst.clone(), dst2.clone()]); + + let dst3 = Path::from("new/nested2/bar.parquet"); + storage.rename(&dst, &dst3).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst2.clone(), dst3.clone()]); + + let err = storage.head(&dst).await.unwrap_err(); + assert!(matches!(err, Error::NotFound { .. 
})); + + storage.delete(&emoji_file).await.unwrap(); + storage.delete(&dst3).await.unwrap(); + storage.delete(&dst2).await.unwrap(); + let files = flatten_list_stream(storage, Some(&emoji_prefix)) + .await + .unwrap(); + assert!(files.is_empty()); + + // Test handling of paths containing percent-encoded sequences + + // "HELLO" percent encoded + let hello_prefix = Path::parse("%48%45%4C%4C%4F").unwrap(); + let path = hello_prefix.child("foo.parquet"); + + storage.put(&path, vec![0, 1].into()).await.unwrap(); + let files = flatten_list_stream(storage, Some(&hello_prefix)) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + + // Cannot list by decoded representation + let files = flatten_list_stream(storage, Some(&Path::from("HELLO"))) + .await + .unwrap(); + assert!(files.is_empty()); + + // Cannot access by decoded representation + let err = storage + .head(&Path::from("HELLO/foo.parquet")) + .await + .unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + storage.delete(&path).await.unwrap(); + + // Test handling of unicode paths + let path = Path::parse("🇦🇺/$shenanigans@@~.txt").unwrap(); + storage.put(&path, "test".into()).await.unwrap(); + + let r = storage.get(&path).await.unwrap(); + assert_eq!(r.bytes().await.unwrap(), "test"); + + let dir = Path::parse("🇦🇺").unwrap(); + let r = storage.list_with_delimiter(None).await.unwrap(); + assert!(r.common_prefixes.contains(&dir)); + + let r = storage.list_with_delimiter(Some(&dir)).await.unwrap(); + assert_eq!(r.objects.len(), 1); + assert_eq!(r.objects[0].location, path); + + storage.delete(&path).await.unwrap(); + + // Can also write non-percent encoded sequences + let path = Path::parse("%Q.parquet").unwrap(); + storage.put(&path, vec![0, 1].into()).await.unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(files, vec![path.clone()]); + + storage.delete(&path).await.unwrap(); + + let path = Path::parse("foo bar/I contain spaces.parquet").unwrap(); + storage.put(&path, vec![0, 1].into()).await.unwrap(); + storage.head(&path).await.unwrap(); + + let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + + storage.delete(&path).await.unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert!(files.is_empty(), "{files:?}"); + + // Test list order + let files = vec![ + Path::from("a a/b.file"), + Path::parse("a%2Fa.file").unwrap(), + Path::from("a/😀.file"), + Path::from("a/a file"), + Path::parse("a/a%2F.file").unwrap(), + Path::from("a/a.file"), + Path::from("a/a/b.file"), + Path::from("a/b.file"), + Path::from("aa/a.file"), + Path::from("ab/a.file"), + ]; + + for file in &files { + storage.put(file, "foo".into()).await.unwrap(); + } + + let cases = [ + (None, Path::from("a")), + (None, Path::from("a/a file")), + (None, Path::from("a/a/b.file")), + (None, Path::from("ab/a.file")), + (None, Path::from("a%2Fa.file")), + (None, Path::from("a/😀.file")), + (Some(Path::from("a")), Path::from("")), + (Some(Path::from("a")), Path::from("a")), + (Some(Path::from("a")), Path::from("a/😀")), + (Some(Path::from("a")), Path::from("a/😀.file")), + (Some(Path::from("a")), Path::from("a/b")), + (Some(Path::from("a")), Path::from("a/a/b.file")), + ]; + + for (prefix, offset) in cases { + let s = storage.list_with_offset(prefix.as_ref(), &offset); + let mut actual: Vec<_> = s.map_ok(|x| x.location).try_collect().await.unwrap(); + + actual.sort_unstable(); + + let expected: Vec<_> 
= files + .iter() + .filter(|x| { + let prefix_match = prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); + prefix_match && *x > &offset + }) + .cloned() + .collect(); + + assert_eq!(actual, expected, "{prefix:?} - {offset:?}"); + } + + // Test bulk delete + let paths = vec![ + Path::from("a/a.file"), + Path::from("a/a/b.file"), + Path::from("aa/a.file"), + Path::from("does_not_exist"), + Path::from("I'm a < & weird path"), + Path::from("ab/a.file"), + Path::from("a/😀.file"), + ]; + + storage.put(&paths[4], "foo".into()).await.unwrap(); + + let out_paths = storage + .delete_stream(futures::stream::iter(paths.clone()).map(Ok).boxed()) + .collect::>() + .await; + + assert_eq!(out_paths.len(), paths.len()); + + let expect_errors = [3]; + + for (i, input_path) in paths.iter().enumerate() { + let err = storage.head(input_path).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + if expect_errors.contains(&i) { + // Some object stores will report NotFound, but others (such as S3) will + // report success regardless. + match &out_paths[i] { + Err(Error::NotFound { path: out_path, .. }) => { + assert!(out_path.ends_with(&input_path.to_string())); + } + Ok(out_path) => { + assert_eq!(out_path, input_path); + } + _ => panic!("unexpected error"), + } + } else { + assert_eq!(out_paths[i].as_ref().unwrap(), input_path); + } + } + + delete_fixtures(storage).await; + + let path = Path::from("empty"); + storage.put(&path, PutPayload::default()).await.unwrap(); + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.size, 0); + let data = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(data.len(), 0); + + storage.delete(&path).await.unwrap(); +} + +/// Tests the ability to read and write [`Attributes`] +pub async fn put_get_attributes(integration: &dyn ObjectStore) { + // Test handling of attributes + let attributes = Attributes::from_iter([ + (Attribute::CacheControl, "max-age=604800"), + ( + Attribute::ContentDisposition, + r#"attachment; filename="test.html""#, + ), + (Attribute::ContentEncoding, "gzip"), + (Attribute::ContentLanguage, "en-US"), + (Attribute::ContentType, "text/html; charset=utf-8"), + ]); + + let path = Path::from("attributes"); + let opts = attributes.clone().into(); + match integration.put_opts(&path, "foo".into(), opts).await { + Ok(_) => { + let r = integration.get(&path).await.unwrap(); + assert_eq!(r.attributes, attributes); + } + Err(Error::NotImplemented) => {} + Err(e) => panic!("{e}"), + } + + let opts = attributes.clone().into(); + match integration.put_multipart_opts(&path, opts).await { + Ok(mut w) => { + w.put_part("foo".into()).await.unwrap(); + w.complete().await.unwrap(); + + let r = integration.get(&path).await.unwrap(); + assert_eq!(r.attributes, attributes); + } + Err(Error::NotImplemented) => {} + Err(e) => panic!("{e}"), + } +} + +/// Tests conditional read requests +pub async fn get_opts(storage: &dyn ObjectStore) { + let path = Path::from("test"); + storage.put(&path, "foo".into()).await.unwrap(); + let meta = storage.head(&path).await.unwrap(); + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. 
}) => {} + Err(e) => panic!("{e}"), + } + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified + chrono::Duration::try_hours(10).unwrap()), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified - chrono::Duration::try_hours(10).unwrap()), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Err(Error::Precondition { .. } | Error::NotSupported { .. }) => {} + d => panic!("{d:?}"), + } + + let options = GetOptions { + if_modified_since: Some(meta.last_modified), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Err(Error::NotModified { .. } | Error::NotSupported { .. }) => {} + d => panic!("{d:?}"), + } + + let options = GetOptions { + if_modified_since: Some(meta.last_modified - chrono::Duration::try_hours(10).unwrap()), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let tag = meta.e_tag.unwrap(); + let options = GetOptions { + if_match: Some(tag.clone()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let options = GetOptions { + if_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + let options = GetOptions { + if_none_match: Some(tag.clone()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::NotModified { .. }), "{err}"); + + let options = GetOptions { + if_none_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let result = storage.put(&path, "test".into()).await.unwrap(); + let new_tag = result.e_tag.unwrap(); + assert_ne!(tag, new_tag); + + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.e_tag.unwrap(), new_tag); + + let options = GetOptions { + if_match: Some(new_tag), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let options = GetOptions { + if_match: Some(tag), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::Precondition { .. 
}), "{err}"); + + if let Some(version) = meta.version { + storage.put(&path, "bar".into()).await.unwrap(); + + let options = GetOptions { + version: Some(version), + ..GetOptions::default() + }; + + // Can retrieve previous version + let get_opts = storage.get_opts(&path, options).await.unwrap(); + let old = get_opts.bytes().await.unwrap(); + assert_eq!(old, b"test".as_slice()); + + // Current version contains the updated data + let current = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(¤t, b"bar".as_slice()); + } +} + +/// Tests conditional writes +pub async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { + // When using DynamoCommit repeated runs of this test will produce the same sequence of records in DynamoDB + // As a result each conditional operation will need to wait for the lease to timeout before proceeding + // One solution would be to clear DynamoDB before each test, but this would require non-trivial additional code + // so we instead just generate a random suffix for the filenames + let rng = thread_rng(); + let suffix = String::from_utf8(rng.sample_iter(Alphanumeric).take(32).collect()).unwrap(); + + delete_fixtures(storage).await; + let path = Path::from(format!("put_opts_{suffix}")); + let v1 = storage + .put_opts(&path, "a".into(), PutMode::Create.into()) + .await + .unwrap(); + + let err = storage + .put_opts(&path, "b".into(), PutMode::Create.into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::AlreadyExists { .. }), "{err}"); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"a"); + + if !supports_update { + return; + } + + let v2 = storage + .put_opts(&path, "c".into(), PutMode::Update(v1.clone().into()).into()) + .await + .unwrap(); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"c"); + + let err = storage + .put_opts(&path, "d".into(), PutMode::Update(v1.into()).into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + storage + .put_opts(&path, "e".into(), PutMode::Update(v2.clone().into()).into()) + .await + .unwrap(); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"e"); + + // Update not exists + let path = Path::from("I don't exist"); + let err = storage + .put_opts(&path, "e".into(), PutMode::Update(v2.into()).into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + const NUM_WORKERS: usize = 5; + const NUM_INCREMENTS: usize = 10; + + let path = Path::from(format!("RACE-{suffix}")); + let mut futures: FuturesUnordered<_> = (0..NUM_WORKERS) + .map(|_| async { + for _ in 0..NUM_INCREMENTS { + loop { + match storage.get(&path).await { + Ok(r) => { + let mode = PutMode::Update(UpdateVersion { + e_tag: r.meta.e_tag.clone(), + version: r.meta.version.clone(), + }); + + let b = r.bytes().await.unwrap(); + let v: usize = std::str::from_utf8(&b).unwrap().parse().unwrap(); + let new = (v + 1).to_string(); + + match storage.put_opts(&path, new.into(), mode.into()).await { + Ok(_) => break, + Err(Error::Precondition { .. }) => continue, + Err(e) => return Err(e), + } + } + Err(Error::NotFound { .. }) => { + let mode = PutMode::Create; + match storage.put_opts(&path, "1".into(), mode.into()).await { + Ok(_) => break, + Err(Error::AlreadyExists { .. 
}) => continue, + Err(e) => return Err(e), + } + } + Err(e) => return Err(e), + } + } + } + Ok(()) + }) + .collect(); + + while futures.next().await.transpose().unwrap().is_some() {} + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + let v = std::str::from_utf8(&b).unwrap().parse::().unwrap(); + assert_eq!(v, NUM_WORKERS * NUM_INCREMENTS); +} + +/// Returns a chunk of length `chunk_length` +fn get_chunk(chunk_length: usize) -> Bytes { + let mut data = vec![0_u8; chunk_length]; + let mut rng = thread_rng(); + // Set a random selection of bytes + for _ in 0..1000 { + data[rng.gen_range(0..chunk_length)] = rng.gen(); + } + data.into() +} + +/// Returns `num_chunks` of length `chunks` +fn get_chunks(chunk_length: usize, num_chunks: usize) -> Vec { + (0..num_chunks).map(|_| get_chunk(chunk_length)).collect() +} + +/// Tests the ability to perform multipart writes +pub async fn stream_get(storage: &DynObjectStore) { + let location = Path::from("test_dir/test_upload_file.txt"); + + // Can write to storage + let data = get_chunks(5 * 1024 * 1024, 3); + let bytes_expected = data.concat(); + let mut upload = storage.put_multipart(&location).await.unwrap(); + let uploads = data.into_iter().map(|x| upload.put_part(x.into())); + futures::future::try_join_all(uploads).await.unwrap(); + + // Object should not yet exist in store + let meta_res = storage.head(&location).await; + assert!(meta_res.is_err()); + assert!(matches!( + meta_res.unwrap_err(), + crate::Error::NotFound { .. } + )); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(&files, &[]); + + let result = storage.list_with_delimiter(None).await.unwrap(); + assert_eq!(&result.objects, &[]); + + upload.complete().await.unwrap(); + + let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); + assert_eq!(bytes_expected, bytes_written); + + // Can overwrite some storage + // Sizes chosen to ensure we write three parts + let data = get_chunks(3_200_000, 7); + let bytes_expected = data.concat(); + let upload = storage.put_multipart(&location).await.unwrap(); + let mut writer = WriteMultipart::new(upload); + for chunk in &data { + writer.write(chunk) + } + writer.finish().await.unwrap(); + let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); + assert_eq!(bytes_expected, bytes_written); + + // We can abort an empty write + let location = Path::from("test_dir/test_abort_upload.txt"); + let mut upload = storage.put_multipart(&location).await.unwrap(); + upload.abort().await.unwrap(); + let get_res = storage.get(&location).await; + assert!(get_res.is_err()); + assert!(matches!( + get_res.unwrap_err(), + crate::Error::NotFound { .. } + )); + + // We can abort an in-progress write + let mut upload = storage.put_multipart(&location).await.unwrap(); + upload + .put_part(data.first().unwrap().clone().into()) + .await + .unwrap(); + + upload.abort().await.unwrap(); + let get_res = storage.get(&location).await; + assert!(get_res.is_err()); + assert!(matches!(get_res.unwrap_err(), Error::NotFound { .. 
})); +} + +/// Tests that directories are transparent +pub async fn list_uses_directories_correctly(storage: &DynObjectStore) { + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await.unwrap(); + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {content_list:?}" + ); + + let location1 = Path::from("foo/x.json"); + let location2 = Path::from("foo.bar/y.json"); + + let data = PutPayload::from("arbitrary data"); + storage.put(&location1, data.clone()).await.unwrap(); + storage.put(&location2, data).await.unwrap(); + + let prefix = Path::from("foo"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); + assert_eq!(content_list, &[location1.clone()]); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert_eq!(result.objects.len(), 1); + assert_eq!(result.objects[0].location, location1); + assert_eq!(result.common_prefixes, &[]); + + // Listing an existing path (file) should return an empty list: + // https://github.com/apache/arrow-rs/issues/3712 + let content_list = flatten_list_stream(storage, Some(&location1)) + .await + .unwrap(); + assert_eq!(content_list, &[]); + + let list = storage.list_with_delimiter(Some(&location1)).await.unwrap(); + assert_eq!(list.objects, &[]); + assert_eq!(list.common_prefixes, &[]); + + let prefix = Path::from("foo/x"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); + assert_eq!(content_list, &[]); + + let list = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert_eq!(list.objects, &[]); + assert_eq!(list.common_prefixes, &[]); +} + +/// Tests listing with delimiter +pub async fn list_with_delimiter(storage: &DynObjectStore) { + delete_fixtures(storage).await; + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await.unwrap(); + assert!(content_list.is_empty()); + + // ==================== do: create files ==================== + let data = Bytes::from("arbitrary data"); + + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/wbwbwb/111/222/333.segment", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| Path::from(s)) + .collect(); + + for f in &files { + storage.put(f, data.clone().into()).await.unwrap(); + } + + // ==================== check: prefix-list `mydb/wb` (directory) ==================== + let prefix = Path::from("mydb/wb"); + + let expected_000 = Path::from("mydb/wb/000"); + let expected_001 = Path::from("mydb/wb/001"); + let expected_location = Path::from("mydb/wb/foo.json"); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + + assert_eq!(result.common_prefixes, vec![expected_000, expected_001]); + assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + assert_eq!(object.size, data.len()); + + // ==================== check: prefix-list `mydb/wb/000/000/001` (partial filename doesn't match) ==================== + let prefix = Path::from("mydb/wb/000/000/001"); + + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert_eq!(result.objects.len(), 0); + + // ==================== check: prefix-list `not_there` (non-existing prefix) ==================== + let prefix = Path::from("not_there"); + + let 
result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert!(result.objects.is_empty()); + + // ==================== do: remove all files ==================== + for f in &files { + storage.delete(f).await.unwrap(); + } + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await.unwrap(); + assert!(content_list.is_empty()); +} + +/// Tests fetching a non-existent object returns a not found error +pub async fn get_nonexistent_object( + storage: &DynObjectStore, + location: Option, +) -> crate::Result { + let location = location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); + + let err = storage.head(&location).await.unwrap_err(); + assert!(matches!(err, Error::NotFound { .. })); + + storage.get(&location).await?.bytes().await +} + +/// Tests copying +pub async fn rename_and_copy(storage: &DynObjectStore) { + // Create two objects + let path1 = Path::from("test1"); + let path2 = Path::from("test2"); + let contents1 = Bytes::from("cats"); + let contents2 = Bytes::from("dogs"); + + // copy() make both objects identical + storage.put(&path1, contents1.clone().into()).await.unwrap(); + storage.put(&path2, contents2.clone().into()).await.unwrap(); + storage.copy(&path1, &path2).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); + assert_eq!(&new_contents, &contents1); + + // rename() copies contents and deletes original + storage.put(&path1, contents1.clone().into()).await.unwrap(); + storage.put(&path2, contents2.clone().into()).await.unwrap(); + storage.rename(&path1, &path2).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); + assert_eq!(&new_contents, &contents1); + let result = storage.get(&path1).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), Error::NotFound { .. })); + + // Clean up + storage.delete(&path2).await.unwrap(); +} + +/// Tests copy if not exists +pub async fn copy_if_not_exists(storage: &DynObjectStore) { + // Create two objects + let path1 = Path::from("test1"); + let path2 = Path::from("not_exists_nested/test2"); + let contents1 = Bytes::from("cats"); + let contents2 = Bytes::from("dogs"); + + // copy_if_not_exists() errors if destination already exists + storage.put(&path1, contents1.clone().into()).await.unwrap(); + storage.put(&path2, contents2.clone().into()).await.unwrap(); + let result = storage.copy_if_not_exists(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + crate::Error::AlreadyExists { .. } + )); + + // copy_if_not_exists() copies contents and allows deleting original + storage.delete(&path2).await.unwrap(); + storage.copy_if_not_exists(&path1, &path2).await.unwrap(); + storage.delete(&path1).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); + assert_eq!(&new_contents, &contents1); + let result = storage.get(&path1).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. 
})); + + // Clean up + storage.delete(&path2).await.unwrap(); +} + +/// Tests copy and renaming behaviour of non-existent objects +pub async fn copy_rename_nonexistent_object(storage: &DynObjectStore) { + // Create empty source object + let path1 = Path::from("test1"); + + // Create destination object + let path2 = Path::from("test2"); + storage.put(&path2, "hello".into()).await.unwrap(); + + // copy() errors if source does not exist + let result = storage.copy(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // rename() errors if source does not exist + let result = storage.rename(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // copy_if_not_exists() errors if source does not exist + let result = storage.copy_if_not_exists(&path1, &path2).await; + assert!(result.is_err()); + assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); + + // Clean up + storage.delete(&path2).await.unwrap(); +} + +/// Tests [`MultipartStore`] +pub async fn multipart(storage: &dyn ObjectStore, multipart: &dyn MultipartStore) { + let path = Path::from("test_multipart"); + let chunk_size = 5 * 1024 * 1024; + + let chunks = get_chunks(chunk_size, 2); + + let id = multipart.create_multipart(&path).await.unwrap(); + + let parts: Vec<_> = futures::stream::iter(chunks) + .enumerate() + .map(|(idx, b)| multipart.put_part(&path, &id, idx, b.into())) + .buffered(2) + .try_collect() + .await + .unwrap(); + + multipart + .complete_multipart(&path, &id, parts) + .await + .unwrap(); + + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.size, chunk_size * 2); + + // Empty case + let path = Path::from("test_empty_multipart"); + + let id = multipart.create_multipart(&path).await.unwrap(); + + let parts = vec![]; + + multipart + .complete_multipart(&path, &id, parts) + .await + .unwrap(); + + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.size, 0); +} + +async fn delete_fixtures(storage: &DynObjectStore) { + let paths = storage.list(None).map_ok(|meta| meta.location).boxed(); + storage + .delete_stream(paths) + .try_collect::>() + .await + .unwrap(); +} diff --git a/src/lib.rs b/src/lib.rs index 9a8f77b..bdf870f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -545,6 +545,9 @@ mod util; mod attributes; +#[cfg(any(feature = "integration", test))] +pub mod integration; + pub use attributes::*; pub use parse::{parse_url, parse_url_opts}; @@ -1285,9 +1288,11 @@ impl From for std::io::Error { } #[cfg(test)] -mod test_util { +mod tests { use super::*; - use futures::TryStreamExt; + use crate::buffered::BufWriter; + use chrono::TimeZone; + use tokio::io::AsyncWriteExt; macro_rules! 
maybe_skip_integration { () => { @@ -1299,1054 +1304,19 @@ mod test_util { } pub(crate) use maybe_skip_integration; - pub async fn flatten_list_stream( - storage: &DynObjectStore, - prefix: Option<&Path>, - ) -> Result> { - storage - .list(prefix) - .map_ok(|meta| meta.location) - .try_collect::>() - .await - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::buffered::BufWriter; - use crate::multipart::MultipartStore; - use crate::test_util::flatten_list_stream; - use chrono::TimeZone; - use futures::stream::FuturesUnordered; - use rand::distributions::Alphanumeric; - use rand::{thread_rng, Rng}; - use tokio::io::AsyncWriteExt; - - pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { - delete_fixtures(storage).await; - - let content_list = flatten_list_stream(storage, None).await.unwrap(); - assert!( - content_list.is_empty(), - "Expected list to be empty; found: {content_list:?}" - ); - - let location = Path::from("test_dir/test_file.json"); - - let data = Bytes::from("arbitrary data"); - storage.put(&location, data.clone().into()).await.unwrap(); - - let root = Path::from("/"); - - // List everything - let content_list = flatten_list_stream(storage, None).await.unwrap(); - assert_eq!(content_list, &[location.clone()]); - - // Should behave the same as no prefix - let content_list = flatten_list_stream(storage, Some(&root)).await.unwrap(); - assert_eq!(content_list, &[location.clone()]); - - // List with delimiter - let result = storage.list_with_delimiter(None).await.unwrap(); - assert_eq!(&result.objects, &[]); - assert_eq!(result.common_prefixes.len(), 1); - assert_eq!(result.common_prefixes[0], Path::from("test_dir")); - - // Should behave the same as no prefix - let result = storage.list_with_delimiter(Some(&root)).await.unwrap(); - assert!(result.objects.is_empty()); - assert_eq!(result.common_prefixes.len(), 1); - assert_eq!(result.common_prefixes[0], Path::from("test_dir")); - - // Should return not found - let err = storage.get(&Path::from("test_dir")).await.unwrap_err(); - assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); - - // Should return not found - let err = storage.head(&Path::from("test_dir")).await.unwrap_err(); - assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); - - // List everything starting with a prefix that should return results - let prefix = Path::from("test_dir"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); - assert_eq!(content_list, &[location.clone()]); - - // List everything starting with a prefix that shouldn't return results - let prefix = Path::from("something"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); - assert!(content_list.is_empty()); - - let read_data = storage.get(&location).await.unwrap().bytes().await.unwrap(); - assert_eq!(&*read_data, data); - - // Test range request - let range = 3..7; - let range_result = storage.get_range(&location, range.clone()).await; - - let bytes = range_result.unwrap(); - assert_eq!(bytes, data.slice(range.clone())); - - let opts = GetOptions { - range: Some(GetRange::Bounded(2..5)), - ..Default::default() - }; - let result = storage.get_opts(&location, opts).await.unwrap(); - // Data is `"arbitrary data"`, length 14 bytes - assert_eq!(result.meta.size, 14); // Should return full object size (#5272) - assert_eq!(result.range, 2..5); - let bytes = result.bytes().await.unwrap(); - assert_eq!(bytes, b"bit".as_ref()); - - let out_of_range = 200..300; - let out_of_range_result = storage.get_range(&location, out_of_range).await; - - // Should be a non-fatal error - out_of_range_result.unwrap_err(); - - let opts = GetOptions { - range: Some(GetRange::Bounded(2..100)), - ..Default::default() - }; - let result = storage.get_opts(&location, opts).await.unwrap(); - assert_eq!(result.range, 2..14); - assert_eq!(result.meta.size, 14); - let bytes = result.bytes().await.unwrap(); - assert_eq!(bytes, b"bitrary data".as_ref()); - - let opts = GetOptions { - range: Some(GetRange::Suffix(2)), - ..Default::default() - }; - match storage.get_opts(&location, opts).await { - Ok(result) => { - assert_eq!(result.range, 12..14); - assert_eq!(result.meta.size, 14); - let bytes = result.bytes().await.unwrap(); - assert_eq!(bytes, b"ta".as_ref()); - } - Err(Error::NotSupported { .. }) => {} - Err(e) => panic!("{e}"), - } - - let opts = GetOptions { - range: Some(GetRange::Suffix(100)), - ..Default::default() - }; - match storage.get_opts(&location, opts).await { - Ok(result) => { - assert_eq!(result.range, 0..14); - assert_eq!(result.meta.size, 14); - let bytes = result.bytes().await.unwrap(); - assert_eq!(bytes, b"arbitrary data".as_ref()); - } - Err(Error::NotSupported { .. }) => {} - Err(e) => panic!("{e}"), - } - - let opts = GetOptions { - range: Some(GetRange::Offset(3)), - ..Default::default() - }; - let result = storage.get_opts(&location, opts).await.unwrap(); - assert_eq!(result.range, 3..14); - assert_eq!(result.meta.size, 14); - let bytes = result.bytes().await.unwrap(); - assert_eq!(bytes, b"itrary data".as_ref()); - - let opts = GetOptions { - range: Some(GetRange::Offset(100)), - ..Default::default() - }; - storage.get_opts(&location, opts).await.unwrap_err(); - - let ranges = vec![0..1, 2..3, 0..5]; - let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); - for (range, bytes) in ranges.iter().zip(bytes) { - assert_eq!(bytes, data.slice(range.clone())) - } - - let head = storage.head(&location).await.unwrap(); - assert_eq!(head.size, data.len()); - - storage.delete(&location).await.unwrap(); - - let content_list = flatten_list_stream(storage, None).await.unwrap(); - assert!(content_list.is_empty()); - - let err = storage.get(&location).await.unwrap_err(); - assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); - - let err = storage.head(&location).await.unwrap_err(); - assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); - - // Test handling of paths containing an encoded delimiter - - let file_with_delimiter = Path::from_iter(["a", "b/c", "foo.file"]); - storage - .put(&file_with_delimiter, "arbitrary".into()) - .await - .unwrap(); - - let files = flatten_list_stream(storage, None).await.unwrap(); - assert_eq!(files, vec![file_with_delimiter.clone()]); - - let files = flatten_list_stream(storage, Some(&Path::from("a/b"))) - .await - .unwrap(); - assert!(files.is_empty()); - - let files = storage - .list_with_delimiter(Some(&Path::from("a/b"))) - .await - .unwrap(); - assert!(files.common_prefixes.is_empty()); - assert!(files.objects.is_empty()); - - let files = storage - .list_with_delimiter(Some(&Path::from("a"))) - .await - .unwrap(); - assert_eq!(files.common_prefixes, vec![Path::from_iter(["a", "b/c"])]); - assert!(files.objects.is_empty()); - - let files = storage - .list_with_delimiter(Some(&Path::from_iter(["a", "b/c"]))) - .await - .unwrap(); - assert!(files.common_prefixes.is_empty()); - assert_eq!(files.objects.len(), 1); - assert_eq!(files.objects[0].location, file_with_delimiter); - - storage.delete(&file_with_delimiter).await.unwrap(); - - // Test handling of paths containing non-ASCII characters, e.g. emoji - - let emoji_prefix = Path::from("🙀"); - let emoji_file = Path::from("🙀/😀.parquet"); - storage.put(&emoji_file, "arbitrary".into()).await.unwrap(); - - storage.head(&emoji_file).await.unwrap(); - storage - .get(&emoji_file) - .await - .unwrap() - .bytes() - .await - .unwrap(); - - let files = flatten_list_stream(storage, Some(&emoji_prefix)) - .await - .unwrap(); - - assert_eq!(files, vec![emoji_file.clone()]); - - let dst = Path::from("foo.parquet"); - storage.copy(&emoji_file, &dst).await.unwrap(); - let mut files = flatten_list_stream(storage, None).await.unwrap(); - files.sort_unstable(); - assert_eq!(files, vec![emoji_file.clone(), dst.clone()]); - - let dst2 = Path::from("new/nested/foo.parquet"); - storage.copy(&emoji_file, &dst2).await.unwrap(); - let mut files = flatten_list_stream(storage, None).await.unwrap(); - files.sort_unstable(); - assert_eq!(files, vec![emoji_file.clone(), dst.clone(), dst2.clone()]); - - let dst3 = Path::from("new/nested2/bar.parquet"); - storage.rename(&dst, &dst3).await.unwrap(); - let mut files = flatten_list_stream(storage, None).await.unwrap(); - files.sort_unstable(); - assert_eq!(files, vec![emoji_file.clone(), dst2.clone(), dst3.clone()]); - - let err = storage.head(&dst).await.unwrap_err(); - assert!(matches!(err, Error::NotFound { .. 
})); - - storage.delete(&emoji_file).await.unwrap(); - storage.delete(&dst3).await.unwrap(); - storage.delete(&dst2).await.unwrap(); - let files = flatten_list_stream(storage, Some(&emoji_prefix)) - .await - .unwrap(); - assert!(files.is_empty()); - - // Test handling of paths containing percent-encoded sequences - - // "HELLO" percent encoded - let hello_prefix = Path::parse("%48%45%4C%4C%4F").unwrap(); - let path = hello_prefix.child("foo.parquet"); - - storage.put(&path, vec![0, 1].into()).await.unwrap(); - let files = flatten_list_stream(storage, Some(&hello_prefix)) - .await - .unwrap(); - assert_eq!(files, vec![path.clone()]); - - // Cannot list by decoded representation - let files = flatten_list_stream(storage, Some(&Path::from("HELLO"))) - .await - .unwrap(); - assert!(files.is_empty()); - - // Cannot access by decoded representation - let err = storage - .head(&Path::from("HELLO/foo.parquet")) - .await - .unwrap_err(); - assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); - - storage.delete(&path).await.unwrap(); - - // Test handling of unicode paths - let path = Path::parse("🇦🇺/$shenanigans@@~.txt").unwrap(); - storage.put(&path, "test".into()).await.unwrap(); - - let r = storage.get(&path).await.unwrap(); - assert_eq!(r.bytes().await.unwrap(), "test"); - - let dir = Path::parse("🇦🇺").unwrap(); - let r = storage.list_with_delimiter(None).await.unwrap(); - assert!(r.common_prefixes.contains(&dir)); - - let r = storage.list_with_delimiter(Some(&dir)).await.unwrap(); - assert_eq!(r.objects.len(), 1); - assert_eq!(r.objects[0].location, path); - - storage.delete(&path).await.unwrap(); - - // Can also write non-percent encoded sequences - let path = Path::parse("%Q.parquet").unwrap(); - storage.put(&path, vec![0, 1].into()).await.unwrap(); - - let files = flatten_list_stream(storage, None).await.unwrap(); - assert_eq!(files, vec![path.clone()]); - - storage.delete(&path).await.unwrap(); - - let path = Path::parse("foo bar/I contain spaces.parquet").unwrap(); - storage.put(&path, vec![0, 1].into()).await.unwrap(); - storage.head(&path).await.unwrap(); - - let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) - .await - .unwrap(); - assert_eq!(files, vec![path.clone()]); - - storage.delete(&path).await.unwrap(); - - let files = flatten_list_stream(storage, None).await.unwrap(); - assert!(files.is_empty(), "{files:?}"); - - // Test list order - let files = vec![ - Path::from("a a/b.file"), - Path::parse("a%2Fa.file").unwrap(), - Path::from("a/😀.file"), - Path::from("a/a file"), - Path::parse("a/a%2F.file").unwrap(), - Path::from("a/a.file"), - Path::from("a/a/b.file"), - Path::from("a/b.file"), - Path::from("aa/a.file"), - Path::from("ab/a.file"), - ]; - - for file in &files { - storage.put(file, "foo".into()).await.unwrap(); - } - - let cases = [ - (None, Path::from("a")), - (None, Path::from("a/a file")), - (None, Path::from("a/a/b.file")), - (None, Path::from("ab/a.file")), - (None, Path::from("a%2Fa.file")), - (None, Path::from("a/😀.file")), - (Some(Path::from("a")), Path::from("")), - (Some(Path::from("a")), Path::from("a")), - (Some(Path::from("a")), Path::from("a/😀")), - (Some(Path::from("a")), Path::from("a/😀.file")), - (Some(Path::from("a")), Path::from("a/b")), - (Some(Path::from("a")), Path::from("a/a/b.file")), - ]; - - for (prefix, offset) in cases { - let s = storage.list_with_offset(prefix.as_ref(), &offset); - let mut actual: Vec<_> = s.map_ok(|x| x.location).try_collect().await.unwrap(); - - actual.sort_unstable(); - - let expected: Vec<_> 
= files - .iter() - .filter(|x| { - let prefix_match = prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); - prefix_match && *x > &offset - }) - .cloned() - .collect(); - - assert_eq!(actual, expected, "{prefix:?} - {offset:?}"); - } - - // Test bulk delete - let paths = vec![ - Path::from("a/a.file"), - Path::from("a/a/b.file"), - Path::from("aa/a.file"), - Path::from("does_not_exist"), - Path::from("I'm a < & weird path"), - Path::from("ab/a.file"), - Path::from("a/😀.file"), - ]; - - storage.put(&paths[4], "foo".into()).await.unwrap(); - - let out_paths = storage - .delete_stream(futures::stream::iter(paths.clone()).map(Ok).boxed()) - .collect::>() - .await; - - assert_eq!(out_paths.len(), paths.len()); - - let expect_errors = [3]; - - for (i, input_path) in paths.iter().enumerate() { - let err = storage.head(input_path).await.unwrap_err(); - assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); - - if expect_errors.contains(&i) { - // Some object stores will report NotFound, but others (such as S3) will - // report success regardless. - match &out_paths[i] { - Err(Error::NotFound { path: out_path, .. }) => { - assert!(out_path.ends_with(&input_path.to_string())); - } - Ok(out_path) => { - assert_eq!(out_path, input_path); - } - _ => panic!("unexpected error"), - } - } else { - assert_eq!(out_paths[i].as_ref().unwrap(), input_path); - } - } - - delete_fixtures(storage).await; - - let path = Path::from("empty"); - storage.put(&path, PutPayload::default()).await.unwrap(); - let meta = storage.head(&path).await.unwrap(); - assert_eq!(meta.size, 0); - let data = storage.get(&path).await.unwrap().bytes().await.unwrap(); - assert_eq!(data.len(), 0); - - storage.delete(&path).await.unwrap(); - } - - pub(crate) async fn put_get_attributes(integration: &dyn ObjectStore) { - // Test handling of attributes - let attributes = Attributes::from_iter([ - (Attribute::CacheControl, "max-age=604800"), - ( - Attribute::ContentDisposition, - r#"attachment; filename="test.html""#, - ), - (Attribute::ContentEncoding, "gzip"), - (Attribute::ContentLanguage, "en-US"), - (Attribute::ContentType, "text/html; charset=utf-8"), - ]); - - let path = Path::from("attributes"); - let opts = attributes.clone().into(); - match integration.put_opts(&path, "foo".into(), opts).await { - Ok(_) => { - let r = integration.get(&path).await.unwrap(); - assert_eq!(r.attributes, attributes); - } - Err(Error::NotImplemented) => {} - Err(e) => panic!("{e}"), - } - - let opts = attributes.clone().into(); - match integration.put_multipart_opts(&path, opts).await { - Ok(mut w) => { - w.put_part("foo".into()).await.unwrap(); - w.complete().await.unwrap(); - - let r = integration.get(&path).await.unwrap(); - assert_eq!(r.attributes, attributes); - } - Err(Error::NotImplemented) => {} - Err(e) => panic!("{e}"), - } - } - - pub(crate) async fn get_opts(storage: &dyn ObjectStore) { - let path = Path::from("test"); - storage.put(&path, "foo".into()).await.unwrap(); - let meta = storage.head(&path).await.unwrap(); - - let options = GetOptions { - if_unmodified_since: Some(meta.last_modified), - ..GetOptions::default() - }; - match storage.get_opts(&path, options).await { - Ok(_) | Err(Error::NotSupported { .. }) => {} - Err(e) => panic!("{e}"), - } - - let options = GetOptions { - if_unmodified_since: Some( - meta.last_modified + chrono::Duration::try_hours(10).unwrap(), - ), - ..GetOptions::default() - }; - match storage.get_opts(&path, options).await { - Ok(_) | Err(Error::NotSupported { .. 
}) => {} - Err(e) => panic!("{e}"), - } - - let options = GetOptions { - if_unmodified_since: Some( - meta.last_modified - chrono::Duration::try_hours(10).unwrap(), - ), - ..GetOptions::default() - }; - match storage.get_opts(&path, options).await { - Err(Error::Precondition { .. } | Error::NotSupported { .. }) => {} - d => panic!("{d:?}"), - } - - let options = GetOptions { - if_modified_since: Some(meta.last_modified), - ..GetOptions::default() - }; - match storage.get_opts(&path, options).await { - Err(Error::NotModified { .. } | Error::NotSupported { .. }) => {} - d => panic!("{d:?}"), - } - - let options = GetOptions { - if_modified_since: Some(meta.last_modified - chrono::Duration::try_hours(10).unwrap()), - ..GetOptions::default() - }; - match storage.get_opts(&path, options).await { - Ok(_) | Err(Error::NotSupported { .. }) => {} - Err(e) => panic!("{e}"), - } - - let tag = meta.e_tag.unwrap(); - let options = GetOptions { - if_match: Some(tag.clone()), - ..GetOptions::default() - }; - storage.get_opts(&path, options).await.unwrap(); - - let options = GetOptions { - if_match: Some("invalid".to_string()), - ..GetOptions::default() - }; - let err = storage.get_opts(&path, options).await.unwrap_err(); - assert!(matches!(err, Error::Precondition { .. }), "{err}"); - - let options = GetOptions { - if_none_match: Some(tag.clone()), - ..GetOptions::default() - }; - let err = storage.get_opts(&path, options).await.unwrap_err(); - assert!(matches!(err, Error::NotModified { .. }), "{err}"); - - let options = GetOptions { - if_none_match: Some("invalid".to_string()), - ..GetOptions::default() - }; - storage.get_opts(&path, options).await.unwrap(); - - let result = storage.put(&path, "test".into()).await.unwrap(); - let new_tag = result.e_tag.unwrap(); - assert_ne!(tag, new_tag); - - let meta = storage.head(&path).await.unwrap(); - assert_eq!(meta.e_tag.unwrap(), new_tag); - - let options = GetOptions { - if_match: Some(new_tag), - ..GetOptions::default() - }; - storage.get_opts(&path, options).await.unwrap(); - - let options = GetOptions { - if_match: Some(tag), - ..GetOptions::default() - }; - let err = storage.get_opts(&path, options).await.unwrap_err(); - assert!(matches!(err, Error::Precondition { .. 
}), "{err}"); - - if let Some(version) = meta.version { - storage.put(&path, "bar".into()).await.unwrap(); - - let options = GetOptions { - version: Some(version), - ..GetOptions::default() - }; - - // Can retrieve previous version - let get_opts = storage.get_opts(&path, options).await.unwrap(); - let old = get_opts.bytes().await.unwrap(); - assert_eq!(old, b"test".as_slice()); - - // Current version contains the updated data - let current = storage.get(&path).await.unwrap().bytes().await.unwrap(); - assert_eq!(¤t, b"bar".as_slice()); - } - } - - pub(crate) async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { - // When using DynamoCommit repeated runs of this test will produce the same sequence of records in DynamoDB - // As a result each conditional operation will need to wait for the lease to timeout before proceeding - // One solution would be to clear DynamoDB before each test, but this would require non-trivial additional code - // so we instead just generate a random suffix for the filenames - let rng = thread_rng(); - let suffix = String::from_utf8(rng.sample_iter(Alphanumeric).take(32).collect()).unwrap(); - - delete_fixtures(storage).await; - let path = Path::from(format!("put_opts_{suffix}")); - let v1 = storage - .put_opts(&path, "a".into(), PutMode::Create.into()) - .await - .unwrap(); - - let err = storage - .put_opts(&path, "b".into(), PutMode::Create.into()) - .await - .unwrap_err(); - assert!(matches!(err, Error::AlreadyExists { .. }), "{err}"); - - let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); - assert_eq!(b.as_ref(), b"a"); - - if !supports_update { - return; - } - - let v2 = storage - .put_opts(&path, "c".into(), PutMode::Update(v1.clone().into()).into()) - .await - .unwrap(); - - let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); - assert_eq!(b.as_ref(), b"c"); - - let err = storage - .put_opts(&path, "d".into(), PutMode::Update(v1.into()).into()) - .await - .unwrap_err(); - assert!(matches!(err, Error::Precondition { .. }), "{err}"); - - storage - .put_opts(&path, "e".into(), PutMode::Update(v2.clone().into()).into()) - .await - .unwrap(); - - let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); - assert_eq!(b.as_ref(), b"e"); - - // Update not exists - let path = Path::from("I don't exist"); - let err = storage - .put_opts(&path, "e".into(), PutMode::Update(v2.into()).into()) - .await - .unwrap_err(); - assert!(matches!(err, Error::Precondition { .. }), "{err}"); - - const NUM_WORKERS: usize = 5; - const NUM_INCREMENTS: usize = 10; - - let path = Path::from(format!("RACE-{suffix}")); - let mut futures: FuturesUnordered<_> = (0..NUM_WORKERS) - .map(|_| async { - for _ in 0..NUM_INCREMENTS { - loop { - match storage.get(&path).await { - Ok(r) => { - let mode = PutMode::Update(UpdateVersion { - e_tag: r.meta.e_tag.clone(), - version: r.meta.version.clone(), - }); - - let b = r.bytes().await.unwrap(); - let v: usize = std::str::from_utf8(&b).unwrap().parse().unwrap(); - let new = (v + 1).to_string(); - - match storage.put_opts(&path, new.into(), mode.into()).await { - Ok(_) => break, - Err(Error::Precondition { .. }) => continue, - Err(e) => return Err(e), - } - } - Err(Error::NotFound { .. }) => { - let mode = PutMode::Create; - match storage.put_opts(&path, "1".into(), mode.into()).await { - Ok(_) => break, - Err(Error::AlreadyExists { .. 
}) => continue, - Err(e) => return Err(e), - } - } - Err(e) => return Err(e), - } - } - } - Ok(()) - }) - .collect(); - - while futures.next().await.transpose().unwrap().is_some() {} - let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); - let v = std::str::from_utf8(&b).unwrap().parse::().unwrap(); - assert_eq!(v, NUM_WORKERS * NUM_INCREMENTS); - } - - /// Returns a chunk of length `chunk_length` - fn get_chunk(chunk_length: usize) -> Bytes { - let mut data = vec![0_u8; chunk_length]; - let mut rng = thread_rng(); - // Set a random selection of bytes - for _ in 0..1000 { - data[rng.gen_range(0..chunk_length)] = rng.gen(); - } - data.into() - } - - /// Returns `num_chunks` of length `chunks` - fn get_chunks(chunk_length: usize, num_chunks: usize) -> Vec { - (0..num_chunks).map(|_| get_chunk(chunk_length)).collect() - } - - pub(crate) async fn stream_get(storage: &DynObjectStore) { - let location = Path::from("test_dir/test_upload_file.txt"); - - // Can write to storage - let data = get_chunks(5 * 1024 * 1024, 3); - let bytes_expected = data.concat(); - let mut upload = storage.put_multipart(&location).await.unwrap(); - let uploads = data.into_iter().map(|x| upload.put_part(x.into())); - futures::future::try_join_all(uploads).await.unwrap(); - - // Object should not yet exist in store - let meta_res = storage.head(&location).await; - assert!(meta_res.is_err()); - assert!(matches!( - meta_res.unwrap_err(), - crate::Error::NotFound { .. } - )); - - let files = flatten_list_stream(storage, None).await.unwrap(); - assert_eq!(&files, &[]); - - let result = storage.list_with_delimiter(None).await.unwrap(); - assert_eq!(&result.objects, &[]); - - upload.complete().await.unwrap(); - - let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); - assert_eq!(bytes_expected, bytes_written); - - // Can overwrite some storage - // Sizes chosen to ensure we write three parts - let data = get_chunks(3_200_000, 7); - let bytes_expected = data.concat(); - let upload = storage.put_multipart(&location).await.unwrap(); - let mut writer = WriteMultipart::new(upload); - for chunk in &data { - writer.write(chunk) - } - writer.finish().await.unwrap(); - let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); - assert_eq!(bytes_expected, bytes_written); - - // We can abort an empty write - let location = Path::from("test_dir/test_abort_upload.txt"); - let mut upload = storage.put_multipart(&location).await.unwrap(); - upload.abort().await.unwrap(); - let get_res = storage.get(&location).await; - assert!(get_res.is_err()); - assert!(matches!( - get_res.unwrap_err(), - crate::Error::NotFound { .. } - )); - - // We can abort an in-progress write - let mut upload = storage.put_multipart(&location).await.unwrap(); - upload - .put_part(data.first().unwrap().clone().into()) - .await - .unwrap(); - - upload.abort().await.unwrap(); - let get_res = storage.get(&location).await; - assert!(get_res.is_err()); - assert!(matches!( - get_res.unwrap_err(), - crate::Error::NotFound { .. 
} - )); - } - - pub(crate) async fn list_uses_directories_correctly(storage: &DynObjectStore) { - delete_fixtures(storage).await; - - let content_list = flatten_list_stream(storage, None).await.unwrap(); - assert!( - content_list.is_empty(), - "Expected list to be empty; found: {content_list:?}" - ); - - let location1 = Path::from("foo/x.json"); - let location2 = Path::from("foo.bar/y.json"); - - let data = PutPayload::from("arbitrary data"); - storage.put(&location1, data.clone()).await.unwrap(); - storage.put(&location2, data).await.unwrap(); - - let prefix = Path::from("foo"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); - assert_eq!(content_list, &[location1.clone()]); - - let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); - assert_eq!(result.objects.len(), 1); - assert_eq!(result.objects[0].location, location1); - assert_eq!(result.common_prefixes, &[]); - - // Listing an existing path (file) should return an empty list: - // https://github.com/apache/arrow-rs/issues/3712 - let content_list = flatten_list_stream(storage, Some(&location1)) - .await - .unwrap(); - assert_eq!(content_list, &[]); - - let list = storage.list_with_delimiter(Some(&location1)).await.unwrap(); - assert_eq!(list.objects, &[]); - assert_eq!(list.common_prefixes, &[]); - - let prefix = Path::from("foo/x"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); - assert_eq!(content_list, &[]); - - let list = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); - assert_eq!(list.objects, &[]); - assert_eq!(list.common_prefixes, &[]); - } - - pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) { - delete_fixtures(storage).await; - - // ==================== check: store is empty ==================== - let content_list = flatten_list_stream(storage, None).await.unwrap(); - assert!(content_list.is_empty()); - - // ==================== do: create files ==================== - let data = Bytes::from("arbitrary data"); - - let files: Vec<_> = [ - "test_file", - "mydb/wb/000/000/000.segment", - "mydb/wb/000/000/001.segment", - "mydb/wb/000/000/002.segment", - "mydb/wb/001/001/000.segment", - "mydb/wb/foo.json", - "mydb/wbwbwb/111/222/333.segment", - "mydb/data/whatevs", - ] - .iter() - .map(|&s| Path::from(s)) - .collect(); - - for f in &files { - storage.put(f, data.clone().into()).await.unwrap(); - } - - // ==================== check: prefix-list `mydb/wb` (directory) ==================== - let prefix = Path::from("mydb/wb"); - - let expected_000 = Path::from("mydb/wb/000"); - let expected_001 = Path::from("mydb/wb/001"); - let expected_location = Path::from("mydb/wb/foo.json"); - - let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); - - assert_eq!(result.common_prefixes, vec![expected_000, expected_001]); - assert_eq!(result.objects.len(), 1); - - let object = &result.objects[0]; - - assert_eq!(object.location, expected_location); - assert_eq!(object.size, data.len()); - - // ==================== check: prefix-list `mydb/wb/000/000/001` (partial filename doesn't match) ==================== - let prefix = Path::from("mydb/wb/000/000/001"); - - let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); - assert!(result.common_prefixes.is_empty()); - assert_eq!(result.objects.len(), 0); - - // ==================== check: prefix-list `not_there` (non-existing prefix) ==================== - let prefix = Path::from("not_there"); - - let result = 
storage.list_with_delimiter(Some(&prefix)).await.unwrap(); - assert!(result.common_prefixes.is_empty()); - assert!(result.objects.is_empty()); - - // ==================== do: remove all files ==================== - for f in &files { - storage.delete(f).await.unwrap(); - } - - // ==================== check: store is empty ==================== - let content_list = flatten_list_stream(storage, None).await.unwrap(); - assert!(content_list.is_empty()); - } - - pub(crate) async fn get_nonexistent_object( - storage: &DynObjectStore, - location: Option, - ) -> crate::Result { - let location = location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); - - let err = storage.head(&location).await.unwrap_err(); - assert!(matches!(err, crate::Error::NotFound { .. })); - - storage.get(&location).await?.bytes().await - } - - pub(crate) async fn rename_and_copy(storage: &DynObjectStore) { - // Create two objects - let path1 = Path::from("test1"); - let path2 = Path::from("test2"); - let contents1 = Bytes::from("cats"); - let contents2 = Bytes::from("dogs"); - - // copy() make both objects identical - storage.put(&path1, contents1.clone().into()).await.unwrap(); - storage.put(&path2, contents2.clone().into()).await.unwrap(); - storage.copy(&path1, &path2).await.unwrap(); - let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); - assert_eq!(&new_contents, &contents1); - - // rename() copies contents and deletes original - storage.put(&path1, contents1.clone().into()).await.unwrap(); - storage.put(&path2, contents2.clone().into()).await.unwrap(); - storage.rename(&path1, &path2).await.unwrap(); - let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); - assert_eq!(&new_contents, &contents1); - let result = storage.get(&path1).await; - assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); - - // Clean up - storage.delete(&path2).await.unwrap(); - } - - pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) { - // Create two objects - let path1 = Path::from("test1"); - let path2 = Path::from("not_exists_nested/test2"); - let contents1 = Bytes::from("cats"); - let contents2 = Bytes::from("dogs"); - - // copy_if_not_exists() errors if destination already exists - storage.put(&path1, contents1.clone().into()).await.unwrap(); - storage.put(&path2, contents2.clone().into()).await.unwrap(); - let result = storage.copy_if_not_exists(&path1, &path2).await; - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - crate::Error::AlreadyExists { .. } - )); - - // copy_if_not_exists() copies contents and allows deleting original - storage.delete(&path2).await.unwrap(); - storage.copy_if_not_exists(&path1, &path2).await.unwrap(); - storage.delete(&path1).await.unwrap(); - let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); - assert_eq!(&new_contents, &contents1); - let result = storage.get(&path1).await; - assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. 
})); - - // Clean up - storage.delete(&path2).await.unwrap(); - } - - pub(crate) async fn copy_rename_nonexistent_object(storage: &DynObjectStore) { - // Create empty source object - let path1 = Path::from("test1"); - - // Create destination object - let path2 = Path::from("test2"); - storage.put(&path2, "hello".into()).await.unwrap(); - - // copy() errors if source does not exist - let result = storage.copy(&path1, &path2).await; - assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); - - // rename() errors if source does not exist - let result = storage.rename(&path1, &path2).await; - assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); - - // copy_if_not_exists() errors if source does not exist - let result = storage.copy_if_not_exists(&path1, &path2).await; - assert!(result.is_err()); - assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); - - // Clean up - storage.delete(&path2).await.unwrap(); - } - - pub(crate) async fn multipart(storage: &dyn ObjectStore, multipart: &dyn MultipartStore) { - let path = Path::from("test_multipart"); - let chunk_size = 5 * 1024 * 1024; - - let chunks = get_chunks(chunk_size, 2); - - let id = multipart.create_multipart(&path).await.unwrap(); - - let parts: Vec<_> = futures::stream::iter(chunks) - .enumerate() - .map(|(idx, b)| multipart.put_part(&path, &id, idx, b.into())) - .buffered(2) - .try_collect() - .await - .unwrap(); - - multipart - .complete_multipart(&path, &id, parts) - .await - .unwrap(); - - let meta = storage.head(&path).await.unwrap(); - assert_eq!(meta.size, chunk_size * 2); - - // Empty case - let path = Path::from("test_empty_multipart"); - - let id = multipart.create_multipart(&path).await.unwrap(); - - let parts = vec![]; - - multipart - .complete_multipart(&path, &id, parts) - .await - .unwrap(); - - let meta = storage.head(&path).await.unwrap(); - assert_eq!(meta.size, 0); + /// Test that the returned stream does not borrow the lifetime of Path + fn list_store<'a>( + store: &'a dyn ObjectStore, + path_str: &str, + ) -> BoxStream<'a, Result> { + let path = Path::from(path_str); + store.list(Some(&path)) } #[cfg(any(feature = "azure", feature = "aws"))] - pub(crate) async fn signing(integration: &T) + pub async fn signing(integration: &T) where - T: ObjectStore + crate::signer::Signer, + T: ObjectStore + signer::Signer, { use reqwest::Method; use std::time::Duration; @@ -2367,7 +1337,7 @@ mod tests { } #[cfg(any(feature = "aws", feature = "azure"))] - pub(crate) async fn tagging(storage: Arc, validate: bool, get_tags: F) + pub async fn tagging(storage: Arc, validate: bool, get_tags: F) where F: Fn(Path) -> Fut + Send + Sync, Fut: std::future::Future> + Send, @@ -2444,24 +1414,6 @@ mod tests { } } - async fn delete_fixtures(storage: &DynObjectStore) { - let paths = storage.list(None).map_ok(|meta| meta.location).boxed(); - storage - .delete_stream(paths) - .try_collect::>() - .await - .unwrap(); - } - - /// Test that the returned stream does not borrow the lifetime of Path - fn list_store<'a>( - store: &'a dyn ObjectStore, - path_str: &str, - ) -> BoxStream<'a, Result> { - let path = Path::from(path_str); - store.list(Some(&path)) - } - #[tokio::test] async fn test_list_lifetimes() { let store = memory::InMemory::new(); diff --git a/src/limit.rs b/src/limit.rs index f3e1d42..64b96ad 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -273,9 +273,9 @@ impl MultipartUpload for LimitUpload { #[cfg(test)] mod tests { + use 
crate::integration::*; use crate::limit::LimitStore; use crate::memory::InMemory; - use crate::tests::*; use crate::ObjectStore; use futures::stream::StreamExt; use std::pin::Pin; diff --git a/src/local.rs b/src/local.rs index 8dec5be..95b50d6 100644 --- a/src/local.rs +++ b/src/local.rs @@ -1005,8 +1005,7 @@ mod tests { use futures::TryStreamExt; use tempfile::{NamedTempFile, TempDir}; - use crate::test_util::flatten_list_stream; - use crate::tests::*; + use crate::integration::*; use super::*; diff --git a/src/memory.rs b/src/memory.rs index daf14e1..0d72983 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -528,7 +528,7 @@ impl MultipartUpload for InMemoryUpload { #[cfg(test)] mod tests { - use crate::tests::*; + use crate::integration::*; use super::*; diff --git a/src/parse.rs b/src/parse.rs index 5549fd3..e5d5149 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -25,15 +25,9 @@ use url::Url; #[derive(Debug, Snafu)] enum Error { - #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] - InvalidUrl { url: Url }, - #[snafu(display("Unable to recognise URL \"{}\"", url))] Unrecognised { url: Url }, - #[snafu(display("Feature {scheme:?} not enabled"))] - NotEnabled { scheme: ObjectStoreScheme }, - #[snafu(context(false))] Path { source: crate::path::Error }, } diff --git a/src/prefix.rs b/src/prefix.rs index 7c9ea58..9b10fea 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -204,9 +204,8 @@ impl ObjectStore for PrefixStore { #[cfg(test)] mod tests { use super::*; + use crate::integration::*; use crate::local::LocalFileSystem; - use crate::test_util::flatten_list_stream; - use crate::tests::*; use tempfile::TempDir; diff --git a/src/throttle.rs b/src/throttle.rs index 38b6d7c..d07276c 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -398,7 +398,7 @@ impl MultipartUpload for ThrottledUpload { #[cfg(test)] mod tests { use super::*; - use crate::{memory::InMemory, tests::*, GetResultPayload}; + use crate::{integration::*, memory::InMemory, GetResultPayload}; use futures::TryStreamExt; use tokio::time::Duration; use tokio::time::Instant; From 502e3c57c50aacfcaac283cfaab950f137dcc19e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 10 May 2024 17:59:21 +0100 Subject: [PATCH 300/397] Fix PutPayloadMut::push not updating content_length (#5743) (#5744) --- src/integration.rs | 10 ++++++++++ src/payload.rs | 16 +++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/integration.rs b/src/integration.rs index 9a7d117..d08c450 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -789,6 +789,16 @@ pub async fn stream_get(storage: &DynObjectStore) { let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); + let location = Path::from("test_dir/test_put_part.txt"); + let upload = storage.put_multipart(&location).await.unwrap(); + let mut write = WriteMultipart::new(upload); + write.put(vec![0; 2].into()); + write.put(vec![3; 4].into()); + write.finish().await.unwrap(); + + let meta = storage.head(&location).await.unwrap(); + assert_eq!(meta.size, 6); + // We can abort an empty write let location = Path::from("test_dir/test_abort_upload.txt"); let mut upload = storage.put_multipart(&location).await.unwrap(); diff --git a/src/payload.rs b/src/payload.rs index 486bea3..d71f016 100644 --- a/src/payload.rs +++ b/src/payload.rs @@ -252,7 +252,8 @@ impl PutPayloadMut { let completed = std::mem::take(&mut self.in_progress); 
self.completed.push(completed.into()) } - self.completed.push(bytes) + self.len += bytes.len(); + self.completed.push(bytes); } /// Returns `true` if this [`PutPayloadMut`] contains no bytes @@ -311,4 +312,17 @@ mod test { assert_eq!(chunks[4].len(), 20); assert_eq!(chunks[5].len(), 6); } + + #[test] + fn test_content_length() { + let mut chunk = PutPayloadMut::new(); + chunk.push(vec![0; 23].into()); + assert_eq!(chunk.content_length(), 23); + chunk.extend_from_slice(&[0; 4]); + assert_eq!(chunk.content_length(), 27); + chunk.push(vec![0; 121].into()); + assert_eq!(chunk.content_length(), 148); + let payload = chunk.freeze(); + assert_eq!(payload.content_length(), 148); + } } From 932be92a17246cd0887a25d9c7ad88a85e9ed040 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 10 May 2024 18:22:08 +0100 Subject: [PATCH 301/397] Prepare object_store 0.10.1 (#5745) --- CHANGELOG-old.md | 54 ++++++++++++++++++++++++++++++++ CHANGELOG.md | 54 +++++++------------------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +-- 4 files changed, 69 insertions(+), 45 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 3ccfcad..f52d900 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -21,6 +21,60 @@ # Changelog +## [object_store_0.10.0](https://github.com/apache/arrow-rs/tree/object_store_0.10.0) (2024-04-17) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.9.1...object_store_0.10.0) + +**Breaking changes:** + +- Add put\_multipart\_opts \(\#5435\) [\#5652](https://github.com/apache/arrow-rs/pull/5652) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add Attributes API \(\#5329\) [\#5650](https://github.com/apache/arrow-rs/pull/5650) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Support non-contiguous put payloads / vectored writes \(\#5514\) [\#5538](https://github.com/apache/arrow-rs/pull/5538) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Replace AsyncWrite with Upload trait and rename MultiPartStore to MultipartStore \(\#5458\) [\#5500](https://github.com/apache/arrow-rs/pull/5500) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Improve Retry Coverage [\#5608](https://github.com/apache/arrow-rs/issues/5608) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Zero Copy Support [\#5593](https://github.com/apache/arrow-rs/issues/5593) +- ObjectStore bulk delete [\#5591](https://github.com/apache/arrow-rs/issues/5591) +- Retry on Broken Connection [\#5589](https://github.com/apache/arrow-rs/issues/5589) +- Inconsistent Multipart Nomenclature [\#5526](https://github.com/apache/arrow-rs/issues/5526) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[ObjectStore\] Non-Contiguous Write Payloads [\#5514](https://github.com/apache/arrow-rs/issues/5514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- In Object Store, return version & etag on multipart put. 
[\#5443](https://github.com/apache/arrow-rs/issues/5443) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Release Object Store 0.9.1 [\#5436](https://github.com/apache/arrow-rs/issues/5436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: allow setting content-type per request [\#5329](https://github.com/apache/arrow-rs/issues/5329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- GCS Signed URL Support [\#5233](https://github.com/apache/arrow-rs/issues/5233) + +**Fixed bugs:** + +- \[object\_store\] minor bug: typos present in local variable [\#5628](https://github.com/apache/arrow-rs/issues/5628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[arrow-csv\] Schema inference requires csv on disk [\#5551](https://github.com/apache/arrow-rs/issues/5551) +- Local object store copy/rename with nonexistent `from` file loops forever instead of erroring [\#5503](https://github.com/apache/arrow-rs/issues/5503) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object store ApplicationDefaultCredentials auth is not working on windows [\#5466](https://github.com/apache/arrow-rs/issues/5466) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- MicrosoftAzure store list result omits empty objects [\#5451](https://github.com/apache/arrow-rs/issues/5451) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Minor: add additional documentation about `BufWriter` [\#5519](https://github.com/apache/arrow-rs/pull/5519) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- minor-fix: removed typos in object\_store sub crate [\#5629](https://github.com/apache/arrow-rs/pull/5629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Silemo](https://github.com/Silemo)) +- Retry on More Error Classes [\#5609](https://github.com/apache/arrow-rs/pull/5609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) +- Fix handling of empty multipart uploads for GCS [\#5590](https://github.com/apache/arrow-rs/pull/5590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Upgrade object\_store dependency to use chrono `0.4.34` [\#5578](https://github.com/apache/arrow-rs/pull/5578) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([l1nxy](https://github.com/l1nxy)) +- Fix Latest Clippy Lints for object\_store [\#5546](https://github.com/apache/arrow-rs/pull/5546) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update reqwest 0.12 and http 1.0 [\#5536](https://github.com/apache/arrow-rs/pull/5536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement MultipartStore for ThrottledStore [\#5533](https://github.com/apache/arrow-rs/pull/5533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix: copy/rename return error if source is nonexistent [\#5528](https://github.com/apache/arrow-rs/pull/5528) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dimbtp](https://github.com/dimbtp)) +- Prepare arrow 51.0.0 
[\#5516](https://github.com/apache/arrow-rs/pull/5516) ([tustvold](https://github.com/tustvold)) +- Implement MultiPartStore for InMemory [\#5495](https://github.com/apache/arrow-rs/pull/5495) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add more comprehensive documentation on testing and benchmarking to CONTRIBUTING.md [\#5478](https://github.com/apache/arrow-rs/pull/5478) ([monkwire](https://github.com/monkwire)) +- add support for gcp application default auth on windows in object store [\#5473](https://github.com/apache/arrow-rs/pull/5473) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Itayazolay](https://github.com/Itayazolay)) +- Update base64 requirement from 0.21 to 0.22 in /object\_store [\#5465](https://github.com/apache/arrow-rs/pull/5465) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Uses ResourceType for filtering list directories instead of workaround [\#5452](https://github.com/apache/arrow-rs/pull/5452) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) +- Add GCS signed URL support [\#5300](https://github.com/apache/arrow-rs/pull/5300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([l1nxy](https://github.com/l1nxy)) + ## [object_store_0.9.1](https://github.com/apache/arrow-rs/tree/object_store_0.9.1) (2024-03-01) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.9.0...object_store_0.9.1) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc58ecb..5beda50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,58 +19,28 @@ # Changelog -## [object_store_0.10.0](https://github.com/apache/arrow-rs/tree/object_store_0.10.0) (2024-04-17) +## [object_store_0.10.1](https://github.com/apache/arrow-rs/tree/object_store_0.10.1) (2024-05-10) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.9.1...object_store_0.10.0) - -**Breaking changes:** - -- Add put\_multipart\_opts \(\#5435\) [\#5652](https://github.com/apache/arrow-rs/pull/5652) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add Attributes API \(\#5329\) [\#5650](https://github.com/apache/arrow-rs/pull/5650) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Support non-contiguous put payloads / vectored writes \(\#5514\) [\#5538](https://github.com/apache/arrow-rs/pull/5538) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Replace AsyncWrite with Upload trait and rename MultiPartStore to MultipartStore \(\#5458\) [\#5500](https://github.com/apache/arrow-rs/pull/5500) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.0...object_store_0.10.1) **Implemented enhancements:** -- Improve Retry Coverage [\#5608](https://github.com/apache/arrow-rs/issues/5608) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Zero Copy Support [\#5593](https://github.com/apache/arrow-rs/issues/5593) -- ObjectStore bulk delete [\#5591](https://github.com/apache/arrow-rs/issues/5591) -- Retry on Broken Connection 
[\#5589](https://github.com/apache/arrow-rs/issues/5589) -- Inconsistent Multipart Nomenclature [\#5526](https://github.com/apache/arrow-rs/issues/5526) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[ObjectStore\] Non-Contiguous Write Payloads [\#5514](https://github.com/apache/arrow-rs/issues/5514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- In Object Store, return version & etag on multipart put. [\#5443](https://github.com/apache/arrow-rs/issues/5443) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Release Object Store 0.9.1 [\#5436](https://github.com/apache/arrow-rs/issues/5436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: allow setting content-type per request [\#5329](https://github.com/apache/arrow-rs/issues/5329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- GCS Signed URL Support [\#5233](https://github.com/apache/arrow-rs/issues/5233) +- Allow specifying PUT options when using `BufWriter` [\#5692](https://github.com/apache/arrow-rs/issues/5692) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add more attributes to `object_store::Attribute` [\#5689](https://github.com/apache/arrow-rs/issues/5689) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- feat object\_store: moving tests from src/ to a tests/ folder and enabling access to test functions for enabling a shared integration test suite [\#5685](https://github.com/apache/arrow-rs/issues/5685) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Release Object Store 0.10.0 [\#5647](https://github.com/apache/arrow-rs/issues/5647) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- \[object\_store\] minor bug: typos present in local variable [\#5628](https://github.com/apache/arrow-rs/issues/5628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[arrow-csv\] Schema inference requires csv on disk [\#5551](https://github.com/apache/arrow-rs/issues/5551) -- Local object store copy/rename with nonexistent `from` file loops forever instead of erroring [\#5503](https://github.com/apache/arrow-rs/issues/5503) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object store ApplicationDefaultCredentials auth is not working on windows [\#5466](https://github.com/apache/arrow-rs/issues/5466) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- MicrosoftAzure store list result omits empty objects [\#5451](https://github.com/apache/arrow-rs/issues/5451) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Using WriteMultipart::put results in 0 bytes being written [\#5743](https://github.com/apache/arrow-rs/issues/5743) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -**Documentation updates:** +**Merged pull requests:** -- Minor: add additional documentation about `BufWriter` [\#5519](https://github.com/apache/arrow-rs/pull/5519) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Fix PutPayloadMut::push not updating content\_length \(\#5743\) [\#5744](https://github.com/apache/arrow-rs/pull/5744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Export object\_store integration tests 
[\#5709](https://github.com/apache/arrow-rs/pull/5709) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add `BufWriter::with_attributes` and `::with_tags` in `object_store` [\#5693](https://github.com/apache/arrow-rs/pull/5693) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([netthier](https://github.com/netthier)) +- Add more attributes to `object_store::Attribute` [\#5690](https://github.com/apache/arrow-rs/pull/5690) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([netthier](https://github.com/netthier)) -**Merged pull requests:** -- minor-fix: removed typos in object\_store sub crate [\#5629](https://github.com/apache/arrow-rs/pull/5629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Silemo](https://github.com/Silemo)) -- Retry on More Error Classes [\#5609](https://github.com/apache/arrow-rs/pull/5609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) -- Fix handling of empty multipart uploads for GCS [\#5590](https://github.com/apache/arrow-rs/pull/5590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Upgrade object\_store dependency to use chrono `0.4.34` [\#5578](https://github.com/apache/arrow-rs/pull/5578) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([l1nxy](https://github.com/l1nxy)) -- Fix Latest Clippy Lints for object\_store [\#5546](https://github.com/apache/arrow-rs/pull/5546) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update reqwest 0.12 and http 1.0 [\#5536](https://github.com/apache/arrow-rs/pull/5536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Implement MultipartStore for ThrottledStore [\#5533](https://github.com/apache/arrow-rs/pull/5533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- fix: copy/rename return error if source is nonexistent [\#5528](https://github.com/apache/arrow-rs/pull/5528) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dimbtp](https://github.com/dimbtp)) -- Prepare arrow 51.0.0 [\#5516](https://github.com/apache/arrow-rs/pull/5516) ([tustvold](https://github.com/tustvold)) -- Implement MultiPartStore for InMemory [\#5495](https://github.com/apache/arrow-rs/pull/5495) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add more comprehensive documentation on testing and benchmarking to CONTRIBUTING.md [\#5478](https://github.com/apache/arrow-rs/pull/5478) ([monkwire](https://github.com/monkwire)) -- add support for gcp application default auth on windows in object store [\#5473](https://github.com/apache/arrow-rs/pull/5473) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Itayazolay](https://github.com/Itayazolay)) -- Update base64 requirement from 0.21 to 0.22 in /object\_store [\#5465](https://github.com/apache/arrow-rs/pull/5465) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Uses ResourceType for filtering list directories instead of workaround [\#5452](https://github.com/apache/arrow-rs/pull/5452) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([andrebsguedes](https://github.com/andrebsguedes)) -- Add GCS signed URL support [\#5300](https://github.com/apache/arrow-rs/pull/5300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([l1nxy](https://github.com/l1nxy)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index c61946f..20d7c2d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.10.0" +version = "0.10.1" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 5a28409..9ba5d89 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.9.1" -FUTURE_RELEASE="object_store_0.10.0" +SINCE_TAG="object_store_0.10.0" +FUTURE_RELEASE="object_store_0.10.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From b32010e291dcf46cf39e910636f9b1e91f508b13 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 11 May 2024 09:34:56 +0100 Subject: [PATCH 302/397] Add additional WriteMultipart tests (#5743) (#5746) * Add additional WriteMultipart tests (#5743) * Clippy --- src/integration.rs | 12 ++++++++ src/upload.rs | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/src/integration.rs b/src/integration.rs index d08c450..31b074f 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -799,6 +799,18 @@ pub async fn stream_get(storage: &DynObjectStore) { let meta = storage.head(&location).await.unwrap(); assert_eq!(meta.size, 6); + let location = Path::from("test_dir/test_put_part_mixed.txt"); + let upload = storage.put_multipart(&location).await.unwrap(); + let mut write = WriteMultipart::new(upload); + write.put(vec![0; 2].into()); + write.write(&[1, 2, 3]); + write.put(vec![4, 5, 6, 7].into()); + write.finish().await.unwrap(); + + let r = storage.get(&location).await.unwrap(); + let r = r.bytes().await.unwrap(); + assert_eq!(r.as_ref(), &[0, 0, 1, 2, 3, 4, 5, 6, 7]); + // We can abort an empty write let location = Path::from("test_dir/test_abort_upload.txt"); let mut upload = storage.put_multipart(&location).await.unwrap(); diff --git a/src/upload.rs b/src/upload.rs index 9805df0..e5f683a 100644 --- a/src/upload.rs +++ b/src/upload.rs @@ -217,9 +217,13 @@ impl WriteMultipart { #[cfg(test)] mod tests { + use std::sync::Arc; use std::time::Duration; use futures::FutureExt; + use parking_lot::Mutex; + use rand::prelude::StdRng; + use rand::{Rng, SeedableRng}; use crate::memory::InMemory; use crate::path::Path; @@ -246,4 +250,69 @@ mod tests { assert!(write.wait_for_capacity(10).now_or_never().is_none()); write.wait_for_capacity(10).await.unwrap() } + + #[derive(Debug, Default)] + struct InstrumentedUpload { + chunks: Arc>>, + } + + #[async_trait] + impl MultipartUpload for InstrumentedUpload { + fn put_part(&mut self, data: PutPayload) -> UploadPart { + self.chunks.lock().push(data); + futures::future::ready(Ok(())).boxed() + } + + async fn complete(&mut self) -> Result { + Ok(PutResult { + e_tag: None, + version: None, + }) + } + + async fn abort(&mut self) -> Result<()> { + unimplemented!() + } + } + + #[tokio::test] + 
async fn test_write_multipart() { + let mut rng = StdRng::seed_from_u64(42); + + for method in [0.0, 0.5, 1.0] { + for _ in 0..10 { + for chunk_size in [1, 17, 23] { + let upload = Box::::default(); + let chunks = Arc::clone(&upload.chunks); + let mut write = WriteMultipart::new_with_chunk_size(upload, chunk_size); + + let mut expected = Vec::with_capacity(1024); + + for _ in 0..50 { + let chunk_size = rng.gen_range(0..30); + let data: Vec<_> = (0..chunk_size).map(|_| rng.gen()).collect(); + expected.extend_from_slice(&data); + + match rng.gen_bool(method) { + true => write.put(data.into()), + false => write.write(&data), + } + } + write.finish().await.unwrap(); + + let chunks = chunks.lock(); + + let actual: Vec<_> = chunks.iter().flatten().flatten().copied().collect(); + assert_eq!(expected, actual); + + for chunk in chunks.iter().take(chunks.len() - 1) { + assert_eq!(chunk.content_length(), chunk_size) + } + + let last_chunk = chunks.last().unwrap().content_length(); + assert!(last_chunk <= chunk_size, "{chunk_size}"); + } + } + } + } } From cc6288dccda663fcee9403b7aef5b9163e2c7eaf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 17 May 2024 16:40:56 +0100 Subject: [PATCH 303/397] Update itertools requirement from 0.12.0 to 0.13.0 in /object_store (#5780) Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version. - [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-itertools/itertools/compare/v0.12.0...v0.13.0) --- updated-dependencies: - dependency-name: itertools dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 20d7c2d..62cbce7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ bytes = "1.0" chrono = { version = "0.4.34", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" -itertools = "0.12.0" +itertools = "0.13.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" From 0736cf9de3e82b44caef3d445533d6c77efa923f Mon Sep 17 00:00:00 2001 From: wiedld Date: Thu, 23 May 2024 02:03:27 -0700 Subject: [PATCH 304/397] chore: update docs to delineate which are recursive (#5794) --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bdf870f..1339d1c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -716,7 +716,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// List all the objects with the given prefix. /// /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of - /// `foo/bar_baz/x`. + /// `foo/bar_baz/x`. List is recursive, i.e. `foo/bar/more/x` will be included. /// /// Note: the order of returned [`ObjectMeta`] is not guaranteed fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; @@ -743,7 +743,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// metadata. /// /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of - /// `foo/bar_baz/x`. + /// `foo/bar_baz/x`. List is not recursive, i.e. `foo/bar/more/x` will not be included. 
async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; /// Copy an object from one path to another in the same object store. From a2c6cb8ab3cc9cf81b4c6fa7acadb3ddcbfadce3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 15:21:57 +0100 Subject: [PATCH 305/397] Update nix requirement from 0.28.0 to 0.29.0 in /object_store (#5799) Updates the requirements on [nix](https://github.com/nix-rust/nix) to permit the latest version. - [Changelog](https://github.com/nix-rust/nix/blob/master/CHANGELOG.md) - [Commits](https://github.com/nix-rust/nix/compare/v0.28.0...v0.29.0) --- updated-dependencies: - dependency-name: nix dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 62cbce7..203e48d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut md-5 = { version = "0.10.6", default-features = false, optional = true } [target.'cfg(target_family="unix")'.dev-dependencies] -nix = { version = "0.28.0", features = ["fs"] } +nix = { version = "0.29.0", features = ["fs"] } [features] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] From bb1e576d2cac3492006587d442e673e782f9957f Mon Sep 17 00:00:00 2001 From: Hesam Pakdaman <14890379+hesampakdaman@users.noreply.github.com> Date: Sat, 25 May 2024 18:33:53 +0200 Subject: [PATCH 306/397] Fix issue #5800: Handle missing files in list_with_delimiter (#5803) --- src/local.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/local.rs b/src/local.rs index 95b50d6..1ce588a 100644 --- a/src/local.rs +++ b/src/local.rs @@ -504,7 +504,7 @@ impl ObjectStore for LocalFileSystem { match config.filesystem_to_path(entry.path()) { Ok(path) => match is_valid_file_path(&path) { - true => Some(convert_entry(entry, path)), + true => convert_entry(entry, path).transpose(), false => None, }, Err(e) => Some(Err(e)), @@ -581,8 +581,8 @@ impl ObjectStore for LocalFileSystem { if is_directory { common_prefixes.insert(prefix.child(common_prefix)); - } else { - objects.push(convert_entry(entry, entry_location)?); + } else if let Some(metadata) = convert_entry(entry, entry_location)? { + objects.push(metadata); } } } @@ -894,12 +894,21 @@ fn open_file(path: &PathBuf) -> Result<(File, Metadata)> { Ok(ret) } -fn convert_entry(entry: DirEntry, location: Path) -> Result { - let metadata = entry.metadata().map_err(|e| Error::Metadata { - source: e.into(), - path: location.to_string(), - })?; - convert_metadata(metadata, location) +fn convert_entry(entry: DirEntry, location: Path) -> Result> { + match entry.metadata() { + Ok(metadata) => convert_metadata(metadata, location).map(Some), + Err(e) => { + if let Some(io_err) = e.io_error() { + if io_err.kind() == ErrorKind::NotFound { + return Ok(None); + } + } + Err(Error::Metadata { + source: e.into(), + path: location.to_string(), + })? 
+ } + } } fn last_modified(metadata: &Metadata) -> DateTime { From 9b2140c147afe3f897ce7f964255c0190b1e0c77 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:19:22 +0100 Subject: [PATCH 307/397] Update quick-xml requirement from 0.31.0 to 0.32.0 in /object_store (#5870) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.31.0...v0.32.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 203e48d..13da3a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } hyper = { version = "1.2", default-features = false, optional = true } -quick-xml = { version = "0.31.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.32.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 6d9962aed994fbcfe9829be0aa0f9cb76b52aeb6 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 13 Jun 2024 19:23:22 +0800 Subject: [PATCH 308/397] feat(object_store): Add `put` API for buffered::BufWriter (#5835) * feat(object_store): Add buffered::BufUploader Signed-off-by: Xuanwo * Polish tests Signed-off-by: Xuanwo * Merge BufUploader into BufWriter Signed-off-by: Xuanwo * Fix docs Signed-off-by: Xuanwo * Add comment Signed-off-by: Xuanwo --------- Signed-off-by: Xuanwo --- src/buffered.rs | 113 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 3 deletions(-) diff --git a/src/buffered.rs b/src/buffered.rs index feb84d4..c7b71aa 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -238,11 +238,11 @@ enum BufWriterState { /// Buffer up to capacity bytes Buffer(Path, PutPayloadMut), /// [`ObjectStore::put_multipart`] - Prepare(BoxFuture<'static, std::io::Result>), + Prepare(BoxFuture<'static, crate::Result>), /// Write to a multipart upload Write(Option), /// [`ObjectStore::put`] - Flush(BoxFuture<'static, std::io::Result<()>>), + Flush(BoxFuture<'static, crate::Result<()>>), } impl BufWriter { @@ -289,6 +289,58 @@ impl BufWriter { } } + /// Write data to the writer in [`Bytes`]. + /// + /// Unlike [`AsyncWrite::poll_write`], `put` can write data without extra copying. + /// + /// This API is recommended while the data source generates [`Bytes`]. 
+ pub async fn put(&mut self, bytes: Bytes) -> crate::Result<()> { + loop { + return match &mut self.state { + BufWriterState::Write(Some(write)) => { + write.wait_for_capacity(self.max_concurrency).await?; + write.put(bytes); + Ok(()) + } + BufWriterState::Write(None) | BufWriterState::Flush(_) => { + panic!("Already shut down") + } + // NOTE + // + // This case should never happen in practice, but rust async API does + // make it possible for users to call `put` before `poll_write` returns `Ready`. + // + // We allow such usage by `await` the future and continue the loop. + BufWriterState::Prepare(f) => { + self.state = BufWriterState::Write(f.await?.into()); + continue; + } + BufWriterState::Buffer(path, b) => { + if b.content_length().saturating_add(bytes.len()) < self.capacity { + b.push(bytes); + Ok(()) + } else { + let buffer = std::mem::take(b); + let path = std::mem::take(path); + let opts = PutMultipartOpts { + attributes: self.attributes.take().unwrap_or_default(), + tags: self.tags.take().unwrap_or_default(), + }; + let upload = self.store.put_multipart_opts(&path, opts).await?; + let mut chunked = + WriteMultipart::new_with_chunk_size(upload, self.capacity); + for chunk in buffer.freeze() { + chunked.put(chunk); + } + chunked.put(bytes); + self.state = BufWriterState::Write(Some(chunked)); + Ok(()) + } + } + }; + } + } + /// Abort this writer, cleaning up any partially uploaded state /// /// # Panic @@ -384,7 +436,7 @@ impl AsyncWrite for BufWriter { Ok(()) })); } - BufWriterState::Flush(f) => return f.poll_unpin(cx), + BufWriterState::Flush(f) => return f.poll_unpin(cx).map_err(std::io::Error::from), BufWriterState::Write(x) => { let upload = x.take().unwrap(); self.state = BufWriterState::Flush( @@ -416,6 +468,7 @@ mod tests { use crate::memory::InMemory; use crate::path::Path; use crate::{Attribute, GetOptions}; + use itertools::Itertools; use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; #[tokio::test] @@ -547,4 +600,58 @@ mod tests { assert_eq!(response.meta.size, 40); assert_eq!(response.attributes, attributes); } + + #[tokio::test] + async fn test_buf_writer_with_put() { + let store = Arc::new(InMemory::new()) as Arc; + let path = Path::from("file.txt"); + + // Test put + let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 30); + writer + .put(Bytes::from((0..20).collect_vec())) + .await + .unwrap(); + writer + .put(Bytes::from((20..25).collect_vec())) + .await + .unwrap(); + writer.shutdown().await.unwrap(); + let response = store + .get_opts( + &path, + GetOptions { + head: true, + ..Default::default() + }, + ) + .await + .unwrap(); + assert_eq!(response.meta.size, 25); + assert_eq!(response.bytes().await.unwrap(), (0..25).collect_vec()); + + // Test multipart + let mut writer = BufWriter::with_capacity(Arc::clone(&store), path.clone(), 30); + writer + .put(Bytes::from((0..20).collect_vec())) + .await + .unwrap(); + writer + .put(Bytes::from((20..40).collect_vec())) + .await + .unwrap(); + writer.shutdown().await.unwrap(); + let response = store + .get_opts( + &path, + GetOptions { + head: true, + ..Default::default() + }, + ) + .await + .unwrap(); + assert_eq!(response.meta.size, 40); + assert_eq!(response.bytes().await.unwrap(), (0..40).collect_vec()); + } } From 9020ac85e352828e1481b2b1d3bf55b24e1c6d14 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 13 Jun 2024 14:52:18 -0400 Subject: [PATCH 309/397] Fix clippy for object_store (#5883) * Fix clippy for object_store * Update object_store/src/aws/credential.rs 
--- src/aws/credential.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 08831fd..01cfb34 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -424,6 +424,8 @@ fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { /// #[derive(Debug)] pub struct InstanceCredentialProvider { + // https://github.com/apache/arrow-rs/issues/5884 + #[allow(dead_code)] pub cache: TokenCache>, pub imdsv1_fallback: bool, pub metadata_endpoint: String, From 15d999fc7f5655c1cb4d48536a45db2ba83c27f4 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Sat, 15 Jun 2024 05:22:36 +0800 Subject: [PATCH 310/397] chore: Remove not used cache in InstanceCredentialProvider (#5888) Signed-off-by: Xuanwo --- src/aws/builder.rs | 1 - src/aws/credential.rs | 3 --- 2 files changed, 4 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 1e42093..ffef3fb 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -898,7 +898,6 @@ impl AmazonS3Builder { info!("Using Instance credential provider"); let token = InstanceCredentialProvider { - cache: Default::default(), imdsv1_fallback: self.imdsv1_fallback.get()?, metadata_endpoint: self .metadata_endpoint diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 01cfb34..c13f8aa 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -424,9 +424,6 @@ fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { /// #[derive(Debug)] pub struct InstanceCredentialProvider { - // https://github.com/apache/arrow-rs/issues/5884 - #[allow(dead_code)] - pub cache: TokenCache>, pub imdsv1_fallback: bool, pub metadata_endpoint: String, } From 90e94cbe07cd8c634e832d92830a581395a602e7 Mon Sep 17 00:00:00 2001 From: Faiaz Sanaulla <105630300+fsdvh@users.noreply.github.com> Date: Sun, 23 Jun 2024 09:40:35 +0200 Subject: [PATCH 311/397] Add `MultipartUpload` blanket implementation for `Box` (#5919) * add impl for box * update * another update * small fix --- src/upload.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/upload.rs b/src/upload.rs index e5f683a..dc499e2 100644 --- a/src/upload.rs +++ b/src/upload.rs @@ -92,6 +92,21 @@ pub trait MultipartUpload: Send + std::fmt::Debug { async fn abort(&mut self) -> Result<()>; } +#[async_trait] +impl MultipartUpload for Box { + fn put_part(&mut self, data: PutPayload) -> UploadPart { + (**self).put_part(data) + } + + async fn complete(&mut self) -> Result { + (**self).complete().await + } + + async fn abort(&mut self) -> Result<()> { + (**self).abort().await + } +} + /// A synchronous write API for uploading data in parallel in fixed size chunks /// /// Uses multiple tokio tasks in a [`JoinSet`] to multiplex upload tasks in parallel From d41b11af89ee464f240ad161996c5b2e988e1309 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 21:44:26 +0100 Subject: [PATCH 312/397] Update quick-xml requirement from 0.32.0 to 0.33.0 in /object_store (#5946) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.32.0...v0.33.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 13da3a8..7f529c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } hyper = { version = "1.2", default-features = false, optional = true } -quick-xml = { version = "0.32.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.33.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 834634aa9883ff79874195aad11a50431063522d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 17:02:58 +0100 Subject: [PATCH 313/397] Update quick-xml requirement from 0.33.0 to 0.34.0 in /object_store (#5954) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.33.0...v0.34.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7f529c7..c18509a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } hyper = { version = "1.2", default-features = false, optional = true } -quick-xml = { version = "0.33.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.34.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 12be570100abce6dcd4add971e9659d5c2f4eaef Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Sun, 30 Jun 2024 12:39:22 +0100 Subject: [PATCH 314/397] Make ObjectStoreScheme public (#5912) * Make ObjectStoreScheme public * Fix clippy, add docs and examples --------- Co-authored-by: Andrew Lamb --- src/lib.rs | 2 +- src/parse.rs | 48 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1339d1c..efbfe0b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -550,7 +550,7 @@ pub mod integration; pub use attributes::*; -pub use parse::{parse_url, parse_url_opts}; +pub use parse::{parse_url, parse_url_opts, ObjectStoreScheme}; pub use payload::*; pub use upload::*; pub use util::{coalesce_ranges, collect_bytes, GetRange, OBJECT_STORE_COALESCE_DEFAULT}; diff --git a/src/parse.rs b/src/parse.rs index e5d5149..debc9e5 100644 --- 
a/src/parse.rs +++ b/src/parse.rs @@ -24,7 +24,7 @@ use snafu::Snafu; use url::Url; #[derive(Debug, Snafu)] -enum Error { +pub enum Error { #[snafu(display("Unable to recognise URL \"{}\"", url))] Unrecognised { url: Url }, @@ -41,9 +41,27 @@ impl From for super::Error { } } -/// Recognises various URL formats, identifying the relevant [`ObjectStore`] -#[derive(Debug, Eq, PartialEq)] -enum ObjectStoreScheme { +/// Recognizes various URL formats, identifying the relevant [`ObjectStore`] +/// +/// See [`ObjectStoreScheme::parse`] for more details +/// +/// # Supported formats: +/// - `file:///path/to/my/file` -> [`LocalFileSystem`] +/// - `memory:///` -> [`InMemory`] +/// - `s3://bucket/path` -> [`AmazonS3`](crate::aws::AmazonS3) (also supports `s3a`) +/// - `gs://bucket/path` -> [`GoogleCloudStorage`](crate::gcp::GoogleCloudStorage) +/// - `az://account/container/path` -> [`MicrosoftAzure`](crate::azure::MicrosoftAzure) (also supports `adl`, `azure`, `abfs`, `abfss`) +/// - `http://mydomain/path` -> [`HttpStore`](crate::http::HttpStore) +/// - `https://mydomain/path` -> [`HttpStore`](crate::http::HttpStore) +/// +/// There are also special cases for AWS and Azure for `https://{host?}/path` paths: +/// - `dfs.core.windows.net`, `blob.core.windows.net`, `dfs.fabric.microsoft.com`, `blob.fabric.microsoft.com` -> [`MicrosoftAzure`](crate::azure::MicrosoftAzure) +/// - `amazonaws.com` -> [`AmazonS3`](crate::aws::AmazonS3) +/// - `r2.cloudflarestorage.com` -> [`AmazonS3`](crate::aws::AmazonS3) +/// +#[non_exhaustive] // permit new variants +#[derive(Debug, Eq, PartialEq, Clone)] +pub enum ObjectStoreScheme { /// Url corresponding to [`LocalFileSystem`] Local, /// Url corresponding to [`InMemory`] @@ -62,7 +80,27 @@ impl ObjectStoreScheme { /// Create an [`ObjectStoreScheme`] from the provided [`Url`] /// /// Returns the [`ObjectStoreScheme`] and the remaining [`Path`] - fn parse(url: &Url) -> Result<(Self, Path), Error> { + /// + /// # Example + /// ``` + /// # use url::Url; + /// # use object_store::ObjectStoreScheme; + /// let url: Url = "file:///path/to/my/file".parse().unwrap(); + /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap(); + /// assert_eq!(scheme, ObjectStoreScheme::Local); + /// assert_eq!(path.as_ref(), "path/to/my/file"); + /// + /// let url: Url = "https://blob.core.windows.net/path/to/my/file".parse().unwrap(); + /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap(); + /// assert_eq!(scheme, ObjectStoreScheme::MicrosoftAzure); + /// assert_eq!(path.as_ref(), "path/to/my/file"); + /// + /// let url: Url = "https://example.com/path/to/my/file".parse().unwrap(); + /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap(); + /// assert_eq!(scheme, ObjectStoreScheme::Http); + /// assert_eq!(path.as_ref(), "path/to/my/file"); + /// ``` + pub fn parse(url: &Url) -> Result<(Self, Path), Error> { let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); let (scheme, path) = match (url.scheme(), url.host_str()) { From f5e999a0e74d9c1a845560623b7fdb9ca65d5f70 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 12:46:06 -0400 Subject: [PATCH 315/397] Update quick-xml requirement from 0.34.0 to 0.35.0 in /object_store (#5983) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. 
- [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.34.0...v0.35.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c18509a..5a0df23 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } hyper = { version = "1.2", default-features = false, optional = true } -quick-xml = { version = "0.34.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.35.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 7c10e6dce6c9e47d2cb9b2a3a304f6d360268394 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Tue, 2 Jul 2024 01:55:59 -0700 Subject: [PATCH 316/397] Add user defined metadata (#5915) * Add metadata attribute * Add user-defined metadata for AWS/GCP/ABS `with_attributes` * Reads and writes both implemented * Add tests for GetClient * Fix an indentation * Placate clippy * Use `strip_prefix` and mutable attributes * Use static Cow for attribute metadata * Add error for value decode failure * Remove unnecessary into --- src/attributes.rs | 17 ++++++++++--- src/aws/client.rs | 6 +++++ src/azure/client.rs | 6 +++++ src/client/get.rs | 60 +++++++++++++++++++++++++++++++++++++++++--- src/client/header.rs | 4 +++ src/gcp/client.rs | 6 +++++ src/http/client.rs | 3 +++ src/integration.rs | 1 + 8 files changed, 95 insertions(+), 8 deletions(-) diff --git a/src/attributes.rs b/src/attributes.rs index ecef32e..11cf27c 100644 --- a/src/attributes.rs +++ b/src/attributes.rs @@ -45,6 +45,10 @@ pub enum Attribute { /// /// See [Cache-Control](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control) CacheControl, + /// Specifies a user-defined metadata field for the object + /// + /// The String is a user-defined key + Metadata(Cow<'static, str>), } /// The value of an [`Attribute`] @@ -194,10 +198,11 @@ mod tests { (Attribute::ContentLanguage, "en-US"), (Attribute::ContentType, "test"), (Attribute::CacheControl, "control"), + (Attribute::Metadata("key1".into()), "value1"), ]); assert!(!attributes.is_empty()); - assert_eq!(attributes.len(), 5); + assert_eq!(attributes.len(), 6); assert_eq!( attributes.get(&Attribute::ContentType), @@ -210,18 +215,18 @@ mod tests { attributes.insert(Attribute::CacheControl, "v1".into()), Some(metav) ); - assert_eq!(attributes.len(), 5); + assert_eq!(attributes.len(), 6); assert_eq!( attributes.remove(&Attribute::CacheControl).unwrap(), "v1".into() ); - assert_eq!(attributes.len(), 4); + assert_eq!(attributes.len(), 5); let metav: AttributeValue = "v2".into(); attributes.insert(Attribute::CacheControl, metav.clone()); assert_eq!(attributes.get(&Attribute::CacheControl), Some(&metav)); - assert_eq!(attributes.len(), 5); + assert_eq!(attributes.len(), 6); assert_eq!( 
attributes.get(&Attribute::ContentDisposition), @@ -235,5 +240,9 @@ mod tests { attributes.get(&Attribute::ContentLanguage), Some(&"en-US".into()) ); + assert_eq!( + attributes.get(&Attribute::Metadata("key1".into())), + Some(&"value1".into()) + ); } } diff --git a/src/aws/client.rs b/src/aws/client.rs index 98226c4..ab4da86 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -61,6 +61,7 @@ use std::sync::Arc; const VERSION_HEADER: &str = "x-amz-version-id"; const SHA256_CHECKSUM: &str = "x-amz-checksum-sha256"; +const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] @@ -326,6 +327,10 @@ impl<'a> Request<'a> { has_content_type = true; builder.header(CONTENT_TYPE, v.as_ref()) } + Attribute::Metadata(k_suffix) => builder.header( + &format!("{}{}", USER_DEFINED_METADATA_HEADER_PREFIX, k_suffix), + v.as_ref(), + ), }; } @@ -642,6 +647,7 @@ impl GetClient for S3Client { etag_required: false, last_modified_required: false, version_header: Some(VERSION_HEADER), + user_defined_metadata_prefix: Some(USER_DEFINED_METADATA_HEADER_PREFIX), }; /// Make an S3 GET request diff --git a/src/azure/client.rs b/src/azure/client.rs index be760c7..b5e82c2 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -48,6 +48,7 @@ use std::time::Duration; use url::Url; const VERSION_HEADER: &str = "x-ms-version-id"; +const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-ms-meta-"; static MS_CACHE_CONTROL: HeaderName = HeaderName::from_static("x-ms-blob-cache-control"); static MS_CONTENT_TYPE: HeaderName = HeaderName::from_static("x-ms-blob-content-type"); static MS_CONTENT_DISPOSITION: HeaderName = @@ -208,6 +209,10 @@ impl<'a> PutRequest<'a> { has_content_type = true; builder.header(&MS_CONTENT_TYPE, v.as_ref()) } + Attribute::Metadata(k_suffix) => builder.header( + &format!("{}{}", USER_DEFINED_METADATA_HEADER_PREFIX, k_suffix), + v.as_ref(), + ), }; } @@ -499,6 +504,7 @@ impl GetClient for AzureClient { etag_required: true, last_modified_required: true, version_header: Some(VERSION_HEADER), + user_defined_metadata_prefix: Some(USER_DEFINED_METADATA_HEADER_PREFIX), }; /// Make an Azure GET request diff --git a/src/client/get.rs b/src/client/get.rs index 430b87b..b45eaa1 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -135,6 +135,9 @@ enum GetResultError { #[snafu(display("Content-Type header contained non UTF-8 characters"))] InvalidContentType { source: ToStrError }, + #[snafu(display("Metadata value for \"{key:?}\" contained non UTF-8 characters"))] + InvalidMetadata { key: String }, + #[snafu(display("Requested {expected:?}, got {actual:?}"))] UnexpectedRange { expected: Range, @@ -192,7 +195,7 @@ fn get_result( }} } - let attributes = parse_attributes!( + let mut attributes = parse_attributes!( response.headers(), ( CACHE_CONTROL, @@ -221,6 +224,24 @@ fn get_result( ) ); + // Add attributes that match the user-defined metadata prefix (e.g. 
x-amz-meta-) + if let Some(prefix) = T::HEADER_CONFIG.user_defined_metadata_prefix { + for (key, val) in response.headers() { + if let Some(suffix) = key.as_str().strip_prefix(prefix) { + if let Ok(val_str) = val.to_str() { + attributes.insert( + Attribute::Metadata(suffix.to_string().into()), + val_str.to_string().into(), + ); + } else { + return Err(GetResultError::InvalidMetadata { + key: key.to_string(), + }); + } + } + } + } + let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { @@ -253,6 +274,7 @@ mod tests { etag_required: false, last_modified_required: false, version_header: None, + user_defined_metadata_prefix: Some("x-test-meta-"), }; async fn get_request(&self, _: &Path, _: GetOptions) -> Result { @@ -265,6 +287,7 @@ mod tests { range: Option>, status: StatusCode, content_range: Option<&str>, + headers: Option>, ) -> Response { let mut builder = http::Response::builder(); if let Some(range) = content_range { @@ -276,6 +299,12 @@ mod tests { None => vec![0_u8; object_size], }; + if let Some(headers) = headers { + for (key, value) in headers { + builder = builder.header(key, value); + } + } + builder .status(status) .header(CONTENT_LENGTH, object_size) @@ -288,7 +317,7 @@ mod tests { async fn test_get_result() { let path = Path::from("test"); - let resp = make_response(12, None, StatusCode::OK, None); + let resp = make_response(12, None, StatusCode::OK, None, None); let res = get_result::(&path, None, resp).unwrap(); assert_eq!(res.meta.size, 12); assert_eq!(res.range, 0..12); @@ -302,6 +331,7 @@ mod tests { Some(2..3), StatusCode::PARTIAL_CONTENT, Some("bytes 2-2/12"), + None, ); let res = get_result::(&path, Some(get_range.clone()), resp).unwrap(); assert_eq!(res.meta.size, 12); @@ -309,7 +339,7 @@ mod tests { let bytes = res.bytes().await.unwrap(); assert_eq!(bytes.len(), 1); - let resp = make_response(12, Some(2..3), StatusCode::OK, None); + let resp = make_response(12, Some(2..3), StatusCode::OK, None, None); let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( err.to_string(), @@ -321,6 +351,7 @@ mod tests { Some(2..3), StatusCode::PARTIAL_CONTENT, Some("bytes 2-3/12"), + None, ); let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!(err.to_string(), "Requested 2..3, got 2..4"); @@ -330,6 +361,7 @@ mod tests { Some(2..3), StatusCode::PARTIAL_CONTENT, Some("bytes 2-2/*"), + None, ); let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( @@ -337,7 +369,7 @@ mod tests { "Failed to parse value for CONTENT_RANGE header: \"bytes 2-2/*\"" ); - let resp = make_response(12, Some(2..3), StatusCode::PARTIAL_CONTENT, None); + let resp = make_response(12, Some(2..3), StatusCode::PARTIAL_CONTENT, None, None); let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( err.to_string(), @@ -349,6 +381,7 @@ mod tests { Some(2..3), StatusCode::PARTIAL_CONTENT, Some("bytes 2-3/2"), + None, ); let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( @@ -361,6 +394,7 @@ mod tests { Some(2..6), StatusCode::PARTIAL_CONTENT, Some("bytes 2-5/6"), + None, ); let res = get_result::(&path, Some(GetRange::Suffix(4)), resp).unwrap(); assert_eq!(res.meta.size, 6); @@ -373,8 +407,26 @@ mod tests { Some(2..6), StatusCode::PARTIAL_CONTENT, Some("bytes 2-3/6"), + None, ); let err = get_result::(&path, Some(GetRange::Suffix(4)), resp).unwrap_err(); assert_eq!(err.to_string(), "Requested 2..6, got 2..4"); + + let resp = 
make_response( + 12, + None, + StatusCode::OK, + None, + Some(vec![("x-test-meta-foo", "bar")]), + ); + let res = get_result::(&path, None, resp).unwrap(); + assert_eq!(res.meta.size, 12); + assert_eq!(res.range, 0..12); + assert_eq!( + res.attributes.get(&Attribute::Metadata("foo".into())), + Some(&"bar".into()) + ); + let bytes = res.bytes().await.unwrap(); + assert_eq!(bytes.len(), 12); } } diff --git a/src/client/header.rs b/src/client/header.rs index e85bf6b..9ce5db4 100644 --- a/src/client/header.rs +++ b/src/client/header.rs @@ -31,6 +31,7 @@ pub struct HeaderConfig { /// /// Defaults to `true` pub etag_required: bool, + /// Whether to require a Last-Modified header when extracting [`ObjectMeta`] from headers. /// /// Defaults to `true` @@ -38,6 +39,9 @@ pub struct HeaderConfig { /// The version header name if any pub version_header: Option<&'static str>, + + /// The user defined metadata prefix if any + pub user_defined_metadata_prefix: Option<&'static str>, } #[derive(Debug, Snafu)] diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 35c64cc..0045383 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -49,6 +49,7 @@ use std::sync::Arc; const VERSION_HEADER: &str = "x-goog-generation"; const DEFAULT_CONTENT_TYPE: &str = "application/octet-stream"; +const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-goog-meta-"; static VERSION_MATCH: HeaderName = HeaderName::from_static("x-goog-if-generation-match"); @@ -199,6 +200,10 @@ impl<'a> Request<'a> { has_content_type = true; builder.header(CONTENT_TYPE, v.as_ref()) } + Attribute::Metadata(k_suffix) => builder.header( + &format!("{}{}", USER_DEFINED_METADATA_HEADER_PREFIX, k_suffix), + v.as_ref(), + ), }; } @@ -567,6 +572,7 @@ impl GetClient for GoogleCloudStorageClient { etag_required: true, last_modified_required: true, version_header: Some(VERSION_HEADER), + user_defined_metadata_prefix: Some(USER_DEFINED_METADATA_HEADER_PREFIX), }; /// Perform a get request diff --git a/src/http/client.rs b/src/http/client.rs index 4dccef8..5def931 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -183,6 +183,8 @@ impl Client { has_content_type = true; builder.header(CONTENT_TYPE, v.as_ref()) } + // Ignore metadata attributes + Attribute::Metadata(_) => builder, }; } @@ -319,6 +321,7 @@ impl GetClient for Client { etag_required: false, last_modified_required: false, version_header: None, + user_defined_metadata_prefix: None, }; async fn get_request(&self, path: &Path, options: GetOptions) -> Result { diff --git a/src/integration.rs b/src/integration.rs index 31b074f..89b21bc 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -480,6 +480,7 @@ pub async fn put_get_attributes(integration: &dyn ObjectStore) { (Attribute::ContentEncoding, "gzip"), (Attribute::ContentLanguage, "en-US"), (Attribute::ContentType, "text/html; charset=utf-8"), + (Attribute::Metadata("test_key".into()), "test_value"), ]); let path = Path::from("attributes"); From 9e601c94371c7df51521d04c6ae70461e3d30750 Mon Sep 17 00:00:00 2001 From: Faiaz Sanaulla <105630300+fsdvh@users.noreply.github.com> Date: Tue, 2 Jul 2024 11:53:38 +0200 Subject: [PATCH 317/397] WriteMultipart Abort on MultipartUpload::complete Error (#5974) * update * another one * more update * another update * debug * debug * some updates * debug * debug * cleanup * cleanup * simplify * address some comments * cleanup on failure * restore abort method * docs --- src/buffered.rs | 7 ++++++- src/local.rs | 11 +++++------ src/upload.rs | 10 +++++++++- 3 files changed, 20 insertions(+), 8 
deletions(-) diff --git a/src/buffered.rs b/src/buffered.rs index c7b71aa..fcd7e06 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -438,7 +438,12 @@ impl AsyncWrite for BufWriter { } BufWriterState::Flush(f) => return f.poll_unpin(cx).map_err(std::io::Error::from), BufWriterState::Write(x) => { - let upload = x.take().unwrap(); + let upload = x.take().ok_or_else(|| { + std::io::Error::new( + ErrorKind::InvalidInput, + "Cannot shutdown a writer that has already been shut down", + ) + })?; self.state = BufWriterState::Flush( async move { upload.finish().await?; diff --git a/src/local.rs b/src/local.rs index 1ce588a..d3bfab8 100644 --- a/src/local.rs +++ b/src/local.rs @@ -724,7 +724,7 @@ struct LocalUpload { #[derive(Debug)] struct UploadState { dest: PathBuf, - file: Mutex>, + file: Mutex, } impl LocalUpload { @@ -732,7 +732,7 @@ impl LocalUpload { Self { state: Arc::new(UploadState { dest, - file: Mutex::new(Some(file)), + file: Mutex::new(file), }), src: Some(src), offset: 0, @@ -748,8 +748,7 @@ impl MultipartUpload for LocalUpload { let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { - let mut f = s.file.lock(); - let file = f.as_mut().context(AbortedSnafu)?; + let mut file = s.file.lock(); file.seek(SeekFrom::Start(offset)) .context(SeekSnafu { path: &s.dest })?; @@ -767,9 +766,9 @@ impl MultipartUpload for LocalUpload { let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { // Ensure no inflight writes - let f = s.file.lock().take().context(AbortedSnafu)?; + let file = s.file.lock(); std::fs::rename(&src, &s.dest).context(UnableToRenameFileSnafu)?; - let metadata = f.metadata().map_err(|e| Error::Metadata { + let metadata = file.metadata().map_err(|e| Error::Metadata { source: e.into(), path: src.to_string_lossy().to_string(), })?; diff --git a/src/upload.rs b/src/upload.rs index dc499e2..4df4d8f 100644 --- a/src/upload.rs +++ b/src/upload.rs @@ -226,7 +226,15 @@ impl WriteMultipart { } self.wait_for_capacity(0).await?; - self.upload.complete().await + + match self.upload.complete().await { + Err(e) => { + self.tasks.shutdown().await; + self.upload.abort().await?; + Err(e) + } + Ok(result) => Ok(result), + } } } From 6b20fd598d67227dd4b80f3dd060e8d28b91a3b8 Mon Sep 17 00:00:00 2001 From: Faiaz Sanaulla <105630300+fsdvh@users.noreply.github.com> Date: Sat, 6 Jul 2024 14:06:20 +0200 Subject: [PATCH 318/397] Automatically cleanup empty dirs in LocalFileSystem (#5978) * automatically cleanup empty dirs * automatic cleanup toggle * configurable cleanup * test for automatic dir deletion * clippy * more comments --- src/local.rs | 75 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/src/local.rs b/src/local.rs index d3bfab8..4847389 100644 --- a/src/local.rs +++ b/src/local.rs @@ -240,6 +240,8 @@ impl From for super::Error { #[derive(Debug)] pub struct LocalFileSystem { config: Arc, + // if you want to delete empty directories when deleting files + automatic_cleanup: bool, } #[derive(Debug)] @@ -266,6 +268,7 @@ impl LocalFileSystem { config: Arc::new(Config { root: Url::parse("file:///").unwrap(), }), + automatic_cleanup: false, } } @@ -282,6 +285,7 @@ impl LocalFileSystem { config: Arc::new(Config { root: absolute_path_to_url(path)?, }), + automatic_cleanup: false, }) } @@ -295,6 +299,12 @@ impl LocalFileSystem { ); self.config.prefix_to_filesystem(location) } + + /// Enable automatic cleanup of empty directories when deleting files + pub fn with_automatic_cleanup(mut self, automatic_cleanup: bool) -> 
Self { + self.automatic_cleanup = automatic_cleanup; + self + } } impl Config { @@ -465,13 +475,36 @@ impl ObjectStore for LocalFileSystem { } async fn delete(&self, location: &Path) -> Result<()> { + let config = Arc::clone(&self.config); let path = self.path_to_filesystem(location)?; - maybe_spawn_blocking(move || match std::fs::remove_file(&path) { - Ok(_) => Ok(()), - Err(e) => Err(match e.kind() { - ErrorKind::NotFound => Error::NotFound { path, source: e }.into(), - _ => Error::UnableToDeleteFile { path, source: e }.into(), - }), + let automactic_cleanup = self.automatic_cleanup; + maybe_spawn_blocking(move || { + if let Err(e) = std::fs::remove_file(&path) { + Err(match e.kind() { + ErrorKind::NotFound => Error::NotFound { path, source: e }.into(), + _ => Error::UnableToDeleteFile { path, source: e }.into(), + }) + } else if automactic_cleanup { + let root = &config.root; + let root = root + .to_file_path() + .map_err(|_| Error::InvalidUrl { url: root.clone() })?; + + // here we will try to traverse up and delete an empty dir if possible until we reach the root or get an error + let mut parent = path.parent(); + + while let Some(loc) = parent { + if loc != root && std::fs::remove_dir(loc).is_ok() { + parent = loc.parent(); + } else { + break; + } + } + + Ok(()) + } else { + Ok(()) + } }) .await } @@ -1010,6 +1043,8 @@ fn convert_walkdir_result( #[cfg(test)] mod tests { + use std::fs; + use futures::TryStreamExt; use tempfile::{NamedTempFile, TempDir}; @@ -1445,6 +1480,34 @@ mod tests { list.sort_unstable(); assert_eq!(list, vec![c, a]); } + + #[tokio::test] + async fn delete_dirs_automatically() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()) + .unwrap() + .with_automatic_cleanup(true); + let location = Path::from("nested/file/test_file"); + let data = Bytes::from("arbitrary data"); + + integration + .put(&location, data.clone().into()) + .await + .unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + + assert_eq!(&*read_data, data); + assert!(fs::read_dir(root.path()).unwrap().count() > 0); + integration.delete(&location).await.unwrap(); + assert!(fs::read_dir(root.path()).unwrap().count() == 0); + } } #[cfg(not(target_arch = "wasm32"))] From b8439d01673172604d5f21d647fa48c2056da19c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:10:02 +0100 Subject: [PATCH 319/397] Update quick-xml requirement from 0.35.0 to 0.36.0 in /object_store (#6032) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.35.0...v0.36.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 5a0df23..3f7b2c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } hyper = { version = "1.2", default-features = false, optional = true } -quick-xml = { version = "0.35.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.36.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 9c4c95ff7c510239265d2feb78216e9d78c29278 Mon Sep 17 00:00:00 2001 From: Hesam Pakdaman <14890379+hesampakdaman@users.noreply.github.com> Date: Sat, 13 Jul 2024 11:49:31 +0200 Subject: [PATCH 320/397] Fix 5592: Colon (:) in in object_store::path::{Path} is not handled on Windows (#5830) * Fix issue #5800: Handle missing files in list_with_delimiter * draft * cargo fmt * Handle leading colon * Add windows CI * Fix CI job * Only run local tests and set target family for failing tests * Run all tests without my changes and removed target os * Restore changes again * Add back newline (removed by mistake) * Fix test after merge with master --- src/local.rs | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/src/local.rs b/src/local.rs index 4847389..db4b4b0 100644 --- a/src/local.rs +++ b/src/local.rs @@ -297,7 +297,22 @@ impl LocalFileSystem { path: location.as_ref() } ); - self.config.prefix_to_filesystem(location) + let path = self.config.prefix_to_filesystem(location)?; + + #[cfg(target_os = "windows")] + let path = { + let path = path.to_string_lossy(); + + // Assume the first char is the drive letter and the next is a colon. 
+ let mut out = String::new(); + let drive = &path[..2]; // The drive letter and colon (e.g., "C:") + let filepath = &path[2..].replace(':', "%3A"); // Replace subsequent colons + out.push_str(drive); + out.push_str(filepath); + PathBuf::from(out) + }; + + Ok(path) } /// Enable automatic cleanup of empty directories when deleting files @@ -1053,6 +1068,7 @@ mod tests { use super::*; #[tokio::test] + #[cfg(target_family = "unix")] async fn file_test() { let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); @@ -1069,6 +1085,7 @@ mod tests { } #[test] + #[cfg(target_family = "unix")] fn test_non_tokio() { let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); @@ -1481,6 +1498,28 @@ mod tests { assert_eq!(list, vec![c, a]); } + #[tokio::test] + #[cfg(target_os = "windows")] + async fn filesystem_filename_with_colon() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + let path = Path::parse("file%3Aname.parquet").unwrap(); + let location = Path::parse("file:name.parquet").unwrap(); + + integration.put(&location, "test".into()).await.unwrap(); + let list = flatten_list_stream(&integration, None).await.unwrap(); + assert_eq!(list, vec![path.clone()]); + + let result = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(result, Bytes::from("test")); + } + #[tokio::test] async fn delete_dirs_automatically() { let root = TempDir::new().unwrap(); From 263ad735830399351d386a9f470ef74664fec2ef Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:35:53 +0100 Subject: [PATCH 321/397] Sanitize error message for sensitive requests (#6074) * Sanitize error message for sensitive requests * Clippy --- src/aws/credential.rs | 1 + src/client/retry.rs | 60 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index c13f8aa..63cb571 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -610,6 +610,7 @@ async fn web_identity( ]) .retryable(retry_config) .idempotent(true) + .sensitive(true) .send() .await? 
.bytes() diff --git a/src/client/retry.rs b/src/client/retry.rs index 5dfdd55..5df4ce0 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -174,6 +174,7 @@ pub struct RetryableRequest { retry_timeout: Duration, backoff: Backoff, + sensitive: bool, idempotent: Option, payload: Option, } @@ -190,6 +191,14 @@ impl RetryableRequest { } } + /// Set whether this request contains sensitive data + /// + /// This will avoid printing out the URL in error messages + #[allow(unused)] + pub fn sensitive(self, sensitive: bool) -> Self { + Self { sensitive, ..self } + } + /// Provide a [`PutPayload`] pub fn payload(self, payload: Option) -> Self { Self { payload, ..self } @@ -206,6 +215,11 @@ impl RetryableRequest { .idempotent .unwrap_or_else(|| self.request.method().is_safe()); + let sanitize_err = move |e: reqwest::Error| match self.sensitive { + true => e.without_url(), + false => e, + }; + loop { let mut request = self .request @@ -238,6 +252,7 @@ impl RetryableRequest { }; } Err(e) => { + let e = sanitize_err(e); let status = r.status(); if retries == max_retries || now.elapsed() > retry_timeout @@ -280,6 +295,8 @@ impl RetryableRequest { } }, Err(e) => { + let e = sanitize_err(e); + let mut do_retry = false; if e.is_connect() || e.is_body() @@ -365,6 +382,7 @@ impl RetryExt for reqwest::RequestBuilder { backoff: Backoff::new(&config.backoff), idempotent: None, payload: None, + sensitive: false, } } @@ -565,6 +583,48 @@ mod tests { "{e}" ); + let url = format!("{}/SENSITIVE", mock.url()); + for _ in 0..=retry.max_retries { + mock.push( + Response::builder() + .status(StatusCode::BAD_GATEWAY) + .body("ignored".to_string()) + .unwrap(), + ); + } + let res = client.request(Method::GET, url).send_retry(&retry).await; + let err = res.unwrap_err().to_string(); + assert!(err.contains("SENSITIVE"), "{err}"); + + let url = format!("{}/SENSITIVE", mock.url()); + for _ in 0..=retry.max_retries { + mock.push( + Response::builder() + .status(StatusCode::BAD_GATEWAY) + .body("ignored".to_string()) + .unwrap(), + ); + } + + // Sensitive requests should strip URL from error + let req = client + .request(Method::GET, &url) + .retryable(&retry) + .sensitive(true); + let err = req.send().await.unwrap_err().to_string(); + assert!(!err.contains("SENSITIVE"), "{err}"); + + for _ in 0..=retry.max_retries { + mock.push_fn(|_| panic!()); + } + + let req = client + .request(Method::GET, &url) + .retryable(&retry) + .sensitive(true); + let err = req.send().await.unwrap_err().to_string(); + assert!(!err.contains("SENSITIVE"), "{err}"); + // Shutdown mock.shutdown().await } From d6d3c9df6ed83844b02f7d3929717a7562ec7fa3 Mon Sep 17 00:00:00 2001 From: barronw <141040627+barronw@users.noreply.github.com> Date: Wed, 17 Jul 2024 11:57:12 -0400 Subject: [PATCH 322/397] use GCE metadata server env var overrides (#6015) * use GCE metadata env var overrides * update docs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- src/gcp/credential.rs | 48 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 829db9b..0e80e62 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -51,6 +51,9 @@ pub const DEFAULT_GCS_BASE_URL: &str = "https://storage.googleapis.com"; const DEFAULT_GCS_PLAYLOAD_STRING: &str = "UNSIGNED-PAYLOAD"; const DEFAULT_GCS_SIGN_BLOB_HOST: &str = "storage.googleapis.com"; +const DEFAULT_METADATA_HOST: &str = "metadata.google.internal"; +const DEFAULT_METADATA_IP: &str = 
"169.254.169.254"; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] @@ -414,17 +417,31 @@ impl TokenProvider for InstanceCredentialProvider { /// Fetch a token from the metadata server. /// Since the connection is local we need to enable http access and don't actually use the client object passed in. + /// Respects the `GCE_METADATA_HOST`, `GCE_METADATA_ROOT`, and `GCE_METADATA_IP` + /// environment variables. + /// + /// References: async fn fetch_token( &self, client: &Client, retry: &RetryConfig, ) -> crate::Result>> { - const METADATA_IP: &str = "169.254.169.254"; - const METADATA_HOST: &str = "metadata"; + let metadata_host = if let Ok(host) = env::var("GCE_METADATA_HOST") { + host + } else if let Ok(host) = env::var("GCE_METADATA_ROOT") { + host + } else { + DEFAULT_METADATA_HOST.to_string() + }; + let metadata_ip = if let Ok(ip) = env::var("GCE_METADATA_IP") { + ip + } else { + DEFAULT_METADATA_IP.to_string() + }; info!("fetching token from metadata server"); - let response = make_metadata_request(client, METADATA_HOST, retry) - .or_else(|_| make_metadata_request(client, METADATA_IP, retry)) + let response = make_metadata_request(client, &metadata_host, retry) + .or_else(|_| make_metadata_request(client, &metadata_ip, retry)) .await?; let token = TemporaryToken { @@ -469,18 +486,33 @@ impl TokenProvider for InstanceSigningCredentialProvider { /// Fetch a token from the metadata server. /// Since the connection is local we need to enable http access and don't actually use the client object passed in. + /// Respects the `GCE_METADATA_HOST`, `GCE_METADATA_ROOT`, and `GCE_METADATA_IP` + /// environment variables. + /// + /// References: async fn fetch_token( &self, client: &Client, retry: &RetryConfig, ) -> crate::Result>> { - const METADATA_IP: &str = "169.254.169.254"; - const METADATA_HOST: &str = "metadata"; + let metadata_host = if let Ok(host) = env::var("GCE_METADATA_HOST") { + host + } else if let Ok(host) = env::var("GCE_METADATA_ROOT") { + host + } else { + DEFAULT_METADATA_HOST.to_string() + }; + + let metadata_ip = if let Ok(ip) = env::var("GCE_METADATA_IP") { + ip + } else { + DEFAULT_METADATA_IP.to_string() + }; info!("fetching token from metadata server"); - let email = make_metadata_request_for_email(client, METADATA_HOST, retry) - .or_else(|_| make_metadata_request_for_email(client, METADATA_IP, retry)) + let email = make_metadata_request_for_email(client, &metadata_host, retry) + .or_else(|_| make_metadata_request_for_email(client, &metadata_ip, retry)) .await?; let token = TemporaryToken { From e470cae04d3ae05a79c2973a9294b306bc329963 Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Wed, 17 Jul 2024 09:48:00 -0700 Subject: [PATCH 323/397] Correct timeout in comment from 5s to 30s (#6073) --- src/client/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/client/mod.rs b/src/client/mod.rs index 3fefbb5..43fd658 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -401,7 +401,7 @@ impl ClientOptions { /// The timeout is applied from when the request starts connecting until the /// response body has finished /// - /// Default is 5 seconds + /// Default is 30 seconds pub fn with_timeout(mut self, timeout: Duration) -> Self { self.timeout = Some(ConfigValue::Parsed(timeout)); self @@ -435,7 +435,7 @@ impl ClientOptions { /// /// This is the length of time an idle connection will be kept alive /// - /// Default is 90 seconds + /// Default is 90 
seconds enforced by reqwest pub fn with_pool_idle_timeout(mut self, timeout: Duration) -> Self { self.pool_idle_timeout = Some(ConfigValue::Parsed(timeout)); self @@ -443,7 +443,7 @@ impl ClientOptions { /// Set the maximum number of idle connections per host /// - /// Default is no limit + /// Default is no limit enforced by reqwest pub fn with_pool_max_idle_per_host(mut self, max: usize) -> Self { self.pool_max_idle_per_host = Some(max.into()); self @@ -451,7 +451,7 @@ impl ClientOptions { /// Sets an interval for HTTP2 Ping frames should be sent to keep a connection alive. /// - /// Default is disabled + /// Default is disabled enforced by reqwest pub fn with_http2_keep_alive_interval(mut self, interval: Duration) -> Self { self.http2_keep_alive_interval = Some(ConfigValue::Parsed(interval)); self @@ -462,7 +462,7 @@ impl ClientOptions { /// If the ping is not acknowledged within the timeout, the connection will be closed. /// Does nothing if http2_keep_alive_interval is disabled. /// - /// Default is disabled + /// Default is disabled enforced by reqwest pub fn with_http2_keep_alive_timeout(mut self, interval: Duration) -> Self { self.http2_keep_alive_timeout = Some(ConfigValue::Parsed(interval)); self @@ -473,7 +473,7 @@ impl ClientOptions { /// If disabled, keep-alive pings are only sent while there are open request/response /// streams. If enabled, pings are also sent when no streams are active /// - /// Default is disabled + /// Default is disabled enforced by reqwest pub fn with_http2_keep_alive_while_idle(mut self) -> Self { self.http2_keep_alive_while_idle = true.into(); self From e109bc9d6fd70a2bf15bea0ef0ebea9624bf7be2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 17 Jul 2024 13:27:33 -0400 Subject: [PATCH 324/397] Prepare for object_store `0.10.2` release (#6079) * Prepare for `object_store 10.2.0` release * Add CHANGELOG * Historical changelog --- CHANGELOG-old.md | 23 ++++++++++++++++ CHANGELOG.md | 46 ++++++++++++++++++++++++-------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 5 ++-- 4 files changed, 62 insertions(+), 14 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index f52d900..4193700 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -21,6 +21,29 @@ # Changelog +## [object_store_0.10.1](https://github.com/apache/arrow-rs/tree/object_store_0.10.1) (2024-05-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.0...object_store_0.10.1) + +**Implemented enhancements:** + +- Allow specifying PUT options when using `BufWriter` [\#5692](https://github.com/apache/arrow-rs/issues/5692) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add more attributes to `object_store::Attribute` [\#5689](https://github.com/apache/arrow-rs/issues/5689) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- feat object\_store: moving tests from src/ to a tests/ folder and enabling access to test functions for enabling a shared integration test suite [\#5685](https://github.com/apache/arrow-rs/issues/5685) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Release Object Store 0.10.0 [\#5647](https://github.com/apache/arrow-rs/issues/5647) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Using WriteMultipart::put results in 0 bytes being written [\#5743](https://github.com/apache/arrow-rs/issues/5743) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- 
Fix PutPayloadMut::push not updating content\_length \(\#5743\) [\#5744](https://github.com/apache/arrow-rs/pull/5744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Export object\_store integration tests [\#5709](https://github.com/apache/arrow-rs/pull/5709) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add `BufWriter::with_attributes` and `::with_tags` in `object_store` [\#5693](https://github.com/apache/arrow-rs/pull/5693) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([netthier](https://github.com/netthier)) +- Add more attributes to `object_store::Attribute` [\#5690](https://github.com/apache/arrow-rs/pull/5690) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([netthier](https://github.com/netthier)) + + ## [object_store_0.10.0](https://github.com/apache/arrow-rs/tree/object_store_0.10.0) (2024-04-17) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.9.1...object_store_0.10.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5beda50..0267ba8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,27 +19,51 @@ # Changelog -## [object_store_0.10.1](https://github.com/apache/arrow-rs/tree/object_store_0.10.1) (2024-05-10) +## [object_store_0.10.2](https://github.com/apache/arrow-rs/tree/object_store_0.10.2) (2024-07-17) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.0...object_store_0.10.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.1...object_store_0.10.2) **Implemented enhancements:** -- Allow specifying PUT options when using `BufWriter` [\#5692](https://github.com/apache/arrow-rs/issues/5692) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add more attributes to `object_store::Attribute` [\#5689](https://github.com/apache/arrow-rs/issues/5689) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- feat object\_store: moving tests from src/ to a tests/ folder and enabling access to test functions for enabling a shared integration test suite [\#5685](https://github.com/apache/arrow-rs/issues/5685) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Release Object Store 0.10.0 [\#5647](https://github.com/apache/arrow-rs/issues/5647) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Relax `WriteMultipart` API to support aborting after completion [\#5977](https://github.com/apache/arrow-rs/issues/5977) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Make ObjectStoreScheme in the object\_store crate public [\#5911](https://github.com/apache/arrow-rs/issues/5911) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add BufUploader to implement same feature upon `WriteMultipart` like `BufWriter` [\#5834](https://github.com/apache/arrow-rs/issues/5834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- Using WriteMultipart::put results in 0 bytes being written [\#5743](https://github.com/apache/arrow-rs/issues/5743) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Investigate why `InstanceCredentialProvider::cache` is flagged as dead code [\#5884](https://github.com/apache/arrow-rs/issues/5884) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] Potential race 
condition in `list_with_delimiter` on `Local` [\#5800](https://github.com/apache/arrow-rs/issues/5800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Correct timeout in comment from 5s to 30s [\#6073](https://github.com/apache/arrow-rs/pull/6073) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trungda](https://github.com/trungda)) +- docs: Fix broken links of object\_store\_opendal README [\#5929](https://github.com/apache/arrow-rs/pull/5929) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- docs: Add object\_store\_opendal as related projects [\#5926](https://github.com/apache/arrow-rs/pull/5926) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- chore: update docs to delineate which ObjectStore lists are recursive [\#5794](https://github.com/apache/arrow-rs/pull/5794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wiedld](https://github.com/wiedld)) +- Document object store release cadence [\#5750](https://github.com/apache/arrow-rs/pull/5750) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) **Merged pull requests:** -- Fix PutPayloadMut::push not updating content\_length \(\#5743\) [\#5744](https://github.com/apache/arrow-rs/pull/5744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Export object\_store integration tests [\#5709](https://github.com/apache/arrow-rs/pull/5709) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add `BufWriter::with_attributes` and `::with_tags` in `object_store` [\#5693](https://github.com/apache/arrow-rs/pull/5693) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([netthier](https://github.com/netthier)) -- Add more attributes to `object_store::Attribute` [\#5690](https://github.com/apache/arrow-rs/pull/5690) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([netthier](https://github.com/netthier)) +- Sanitize error message for sensitive requests [\#6074](https://github.com/apache/arrow-rs/pull/6074) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.35.0 to 0.36.0 in /object\_store [\#6032](https://github.com/apache/arrow-rs/pull/6032) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- use GCE metadata server env var overrides [\#6015](https://github.com/apache/arrow-rs/pull/6015) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([barronw](https://github.com/barronw)) +- Update quick-xml requirement from 0.34.0 to 0.35.0 in /object\_store [\#5983](https://github.com/apache/arrow-rs/pull/5983) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Automatically cleanup empty dirs in LocalFileSystem [\#5978](https://github.com/apache/arrow-rs/pull/5978) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) +- WriteMultipart Abort on MultipartUpload::complete Error [\#5974](https://github.com/apache/arrow-rs/pull/5974) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) +- Update quick-xml requirement from 0.33.0 to 0.34.0 in /object\_store [\#5954](https://github.com/apache/arrow-rs/pull/5954) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update quick-xml requirement from 0.32.0 to 0.33.0 in /object\_store [\#5946](https://github.com/apache/arrow-rs/pull/5946) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add `MultipartUpload` blanket implementation for `Box` [\#5919](https://github.com/apache/arrow-rs/pull/5919) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) +- Add user defined metadata [\#5915](https://github.com/apache/arrow-rs/pull/5915) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([criccomini](https://github.com/criccomini)) +- Make ObjectStoreScheme public [\#5912](https://github.com/apache/arrow-rs/pull/5912) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([orf](https://github.com/orf)) +- chore: Remove not used cache in InstanceCredentialProvider [\#5888](https://github.com/apache/arrow-rs/pull/5888) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- Fix clippy for object\_store [\#5883](https://github.com/apache/arrow-rs/pull/5883) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Update quick-xml requirement from 0.31.0 to 0.32.0 in /object\_store [\#5870](https://github.com/apache/arrow-rs/pull/5870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat\(object\_store\): Add `put` API for buffered::BufWriter [\#5835](https://github.com/apache/arrow-rs/pull/5835) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- Fix 5592: Colon \(:\) in in object\_store::path::{Path} is not handled on Windows [\#5830](https://github.com/apache/arrow-rs/pull/5830) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([hesampakdaman](https://github.com/hesampakdaman)) +- Fix issue \#5800: Handle missing files in list\_with\_delimiter [\#5803](https://github.com/apache/arrow-rs/pull/5803) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([hesampakdaman](https://github.com/hesampakdaman)) +- Update nix requirement from 0.28.0 to 0.29.0 in /object\_store [\#5799](https://github.com/apache/arrow-rs/pull/5799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update itertools requirement from 0.12.0 to 0.13.0 in /object\_store [\#5780](https://github.com/apache/arrow-rs/pull/5780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add additional WriteMultipart tests \(\#5743\) [\#5746](https://github.com/apache/arrow-rs/pull/5746) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) diff --git a/Cargo.toml b/Cargo.toml index 3f7b2c0..4b11661 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = 
"0.10.1" +version = "0.10.2" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 9ba5d89..9c684ea 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.10.0" -FUTURE_RELEASE="object_store_0.10.1" +SINCE_TAG="object_store_0.10.1" +FUTURE_RELEASE="object_store_0.10.2" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" @@ -50,6 +50,7 @@ docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pw --cache-log=.githubchangeloggenerator.cache.log \ --http-cache \ --max-issues=600 \ + --include-labels="object-store" \ --exclude-tags-regex "(^\d+\.\d+\.\d+$)|(rc)" \ --since-tag ${SINCE_TAG} \ --future-release ${FUTURE_RELEASE} From 37d09369188faf8183a82bbda5c87e5e88d54114 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 Jul 2024 16:22:35 -0400 Subject: [PATCH 325/397] Fix clippy in object_store crate (#6120) * Fix clippy in object_store crate * clippy ignore --- src/client/mock_server.rs | 2 ++ src/lib.rs | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/client/mock_server.rs b/src/client/mock_server.rs index aa5a9e0..0f8e8bf 100644 --- a/src/client/mock_server.rs +++ b/src/client/mock_server.rs @@ -60,6 +60,8 @@ impl MockServer { let mut set = JoinSet::new(); loop { + // https://github.com/apache/arrow-rs/issues/6122 + #[allow(clippy::incompatible_msrv)] let (stream, _) = tokio::select! { conn = listener.accept() => conn.unwrap(), _ = &mut rx => break, diff --git a/src/lib.rs b/src/lib.rs index efbfe0b..904cb67 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -39,13 +39,13 @@ //! # Highlights //! //! 1. A high-performance async API focused on providing a consistent interface -//! mirroring that of object stores such as [S3] +//! mirroring that of object stores such as [S3] //! //! 2. Production quality, leading this crate to be used in large -//! scale production systems, such as [crates.io] and [InfluxDB IOx] +//! scale production systems, such as [crates.io] and [InfluxDB IOx] //! //! 3. Support for advanced functionality, including atomic, conditional reads -//! and writes, vectored IO, bulk deletion, and more... +//! and writes, vectored IO, bulk deletion, and more... //! //! 4. Stable and predictable governance via the [Apache Arrow] project //! @@ -98,7 +98,7 @@ //! * Methods map directly to object store APIs, providing both efficiency and predictability //! * Abstracts away filesystem and operating system specific quirks, ensuring portability //! * Allows for functionality not native to filesystems, such as operation preconditions -//! and atomic multipart uploads +//! and atomic multipart uploads //! //! This crate does provide [`BufReader`] and [`BufWriter`] adapters //! which provide a more filesystem-like API for working with the From 86f2febe3138565b2b7b44bf03360b533640d5f0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 31 Jul 2024 12:39:36 -0400 Subject: [PATCH 326/397] Update object store MSRV to `1.64` (#6123) * Update MSRV to 1.64 * Revert "clippy ignore" This reverts commit 7a4b760bfb2a63c7778b20a4710c2828224f9565. 
--- Cargo.toml | 2 +- src/client/mock_server.rs | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4b11661..4e845e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,7 @@ readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] repository = "https://github.com/apache/arrow-rs/tree/master/object_store" -rust-version = "1.62.1" +rust-version = "1.64.0" [package.metadata.docs.rs] all-features = true diff --git a/src/client/mock_server.rs b/src/client/mock_server.rs index 0f8e8bf..aa5a9e0 100644 --- a/src/client/mock_server.rs +++ b/src/client/mock_server.rs @@ -60,8 +60,6 @@ impl MockServer { let mut set = JoinSet::new(); loop { - // https://github.com/apache/arrow-rs/issues/6122 - #[allow(clippy::incompatible_msrv)] let (stream, _) = tokio::select! { conn = listener.accept() => conn.unwrap(), _ = &mut rx => break, From 5a18a1cb0014f6db7cf0ac9066f83b33c00890ad Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Aug 2024 16:14:38 +0100 Subject: [PATCH 327/397] Make object_store errors non-exhaustive (#6165) --- src/lib.rs | 1 + src/path/mod.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 904cb67..7699477 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1225,6 +1225,7 @@ pub type Result = std::result::Result; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] +#[non_exhaustive] pub enum Error { #[snafu(display("Generic {} error: {}", store, source))] Generic { diff --git a/src/path/mod.rs b/src/path/mod.rs index f914862..59e08e2 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -37,6 +37,7 @@ pub use parts::{InvalidPart, PathPart}; /// Error returned by [`Path::parse`] #[derive(Debug, Snafu)] #[allow(missing_docs)] +#[non_exhaustive] pub enum Error { #[snafu(display("Path \"{}\" contained empty path segment", path))] EmptySegment { path: String }, From 58acb7f71d61ba774a444b60908ebd5e9cfe479f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 Aug 2024 11:15:20 -0400 Subject: [PATCH 328/397] Update snafu (#5930) (#6070) Co-authored-by: Jesse --- Cargo.toml | 2 +- src/client/get.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4e845e5..71e0bcb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ humantime = "2.1" itertools = "0.13.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" -snafu = "0.7" +snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } tracing = { version = "0.1" } url = "2.2" walkdir = "2" diff --git a/src/client/get.rs b/src/client/get.rs index b45eaa1..0fef578 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -103,7 +103,7 @@ enum GetResultError { source: crate::client::header::Error, }, - #[snafu(context(false))] + #[snafu(transparent)] InvalidRangeRequest { source: crate::util::InvalidGetRange, }, @@ -386,7 +386,7 @@ mod tests { let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( err.to_string(), - "InvalidRangeRequest: Wanted range starting at 2, but object was only 2 bytes long" + "Wanted range starting at 2, but object was only 2 bytes long" ); let resp = make_response( From 7f06294eb29643dd22e7674d9ce82d7d5979ab94 Mon Sep 17 00:00:00 2001 From: Kyle McCarthy Date: Thu, 8 Aug 
2024 12:45:14 -0500 Subject: [PATCH 329/397] feat(object_store): add `PermissionDenied` variant to top-level error (#6194) * feat(object_store): add `PermissionDenied` variant to top-level error * Update object_store/src/lib.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * refactor: add additional error variant for unauthenticated ops * fix: include path in unauthenticated error --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/client/retry.rs | 12 ++++++++++++ src/lib.rs | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/client/retry.rs b/src/client/retry.rs index 5df4ce0..1fc689c 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -86,6 +86,14 @@ impl Error { path, source: Box::new(self), }, + Some(StatusCode::FORBIDDEN) => crate::Error::PermissionDenied { + path, + source: Box::new(self), + }, + Some(StatusCode::UNAUTHORIZED) => crate::Error::Unauthenticated { + path, + source: Box::new(self), + }, _ => crate::Error::Generic { store, source: Box::new(self), @@ -106,6 +114,10 @@ impl From for std::io::Error { status: StatusCode::BAD_REQUEST, .. } => Self::new(ErrorKind::InvalidInput, err), + Error::Client { + status: StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN, + .. + } => Self::new(ErrorKind::PermissionDenied, err), Error::Reqwest { source, .. } if source.is_timeout() => { Self::new(ErrorKind::TimedOut, err) } diff --git a/src/lib.rs b/src/lib.rs index 7699477..4184d58 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1274,6 +1274,26 @@ pub enum Error { #[snafu(display("Operation not yet implemented."))] NotImplemented, + #[snafu(display( + "The operation lacked the necessary privileges to complete for path {}: {}", + path, + source + ))] + PermissionDenied { + path: String, + source: Box, + }, + + #[snafu(display( + "The operation lacked valid authentication credentials for path {}: {}", + path, + source + ))] + Unauthenticated { + path: String, + source: Box, + }, + #[snafu(display("Configuration key: '{}' is not valid for store '{}'.", key, store))] UnknownConfigurationKey { store: &'static str, key: String }, } From b04f6336df05e80f00177b8e77205d1186b52fa3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 12 Aug 2024 17:23:43 -0400 Subject: [PATCH 330/397] Prepare for object_store `0.11.0` release (#6227) * Update version to 0.11.0 * Changelog for 0.11.0 * Remove irrelevant content from changelog --- CHANGELOG-old.md | 50 +++++++++++++++++++++++++++++++- CHANGELOG.md | 47 +++++------------------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +-- 4 files changed, 60 insertions(+), 43 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 4193700..172b0f9 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,7 +19,55 @@ # Historical Changelog -# Changelog +## [object_store_0.10.2](https://github.com/apache/arrow-rs/tree/object_store_0.10.2) (2024-07-17) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.1...object_store_0.10.2) + +**Implemented enhancements:** + +- Relax `WriteMultipart` API to support aborting after completion [\#5977](https://github.com/apache/arrow-rs/issues/5977) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Make ObjectStoreScheme in the object\_store crate public [\#5911](https://github.com/apache/arrow-rs/issues/5911) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add BufUploader to implement same feature upon 
`WriteMultipart` like `BufWriter` [\#5834](https://github.com/apache/arrow-rs/issues/5834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Investigate why `InstanceCredentialProvider::cache` is flagged as dead code [\#5884](https://github.com/apache/arrow-rs/issues/5884) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] Potential race condition in `list_with_delimiter` on `Local` [\#5800](https://github.com/apache/arrow-rs/issues/5800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Correct timeout in comment from 5s to 30s [\#6073](https://github.com/apache/arrow-rs/pull/6073) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trungda](https://github.com/trungda)) +- docs: Fix broken links of object\_store\_opendal README [\#5929](https://github.com/apache/arrow-rs/pull/5929) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- docs: Add object\_store\_opendal as related projects [\#5926](https://github.com/apache/arrow-rs/pull/5926) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- chore: update docs to delineate which ObjectStore lists are recursive [\#5794](https://github.com/apache/arrow-rs/pull/5794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wiedld](https://github.com/wiedld)) +- Document object store release cadence [\#5750](https://github.com/apache/arrow-rs/pull/5750) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- Sanitize error message for sensitive requests [\#6074](https://github.com/apache/arrow-rs/pull/6074) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.35.0 to 0.36.0 in /object\_store [\#6032](https://github.com/apache/arrow-rs/pull/6032) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- use GCE metadata server env var overrides [\#6015](https://github.com/apache/arrow-rs/pull/6015) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([barronw](https://github.com/barronw)) +- Update quick-xml requirement from 0.34.0 to 0.35.0 in /object\_store [\#5983](https://github.com/apache/arrow-rs/pull/5983) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Automatically cleanup empty dirs in LocalFileSystem [\#5978](https://github.com/apache/arrow-rs/pull/5978) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) +- WriteMultipart Abort on MultipartUpload::complete Error [\#5974](https://github.com/apache/arrow-rs/pull/5974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) +- Update quick-xml requirement from 0.33.0 to 0.34.0 in /object\_store [\#5954](https://github.com/apache/arrow-rs/pull/5954) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update quick-xml requirement from 0.32.0 to 0.33.0 in /object\_store [\#5946](https://github.com/apache/arrow-rs/pull/5946) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add `MultipartUpload` blanket implementation for `Box` [\#5919](https://github.com/apache/arrow-rs/pull/5919) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) +- Add user defined metadata [\#5915](https://github.com/apache/arrow-rs/pull/5915) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([criccomini](https://github.com/criccomini)) +- Make ObjectStoreScheme public [\#5912](https://github.com/apache/arrow-rs/pull/5912) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([orf](https://github.com/orf)) +- chore: Remove not used cache in InstanceCredentialProvider [\#5888](https://github.com/apache/arrow-rs/pull/5888) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- Fix clippy for object\_store [\#5883](https://github.com/apache/arrow-rs/pull/5883) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Update quick-xml requirement from 0.31.0 to 0.32.0 in /object\_store [\#5870](https://github.com/apache/arrow-rs/pull/5870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat\(object\_store\): Add `put` API for buffered::BufWriter [\#5835](https://github.com/apache/arrow-rs/pull/5835) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) +- Fix 5592: Colon \(:\) in in object\_store::path::{Path} is not handled on Windows [\#5830](https://github.com/apache/arrow-rs/pull/5830) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([hesampakdaman](https://github.com/hesampakdaman)) +- Fix issue \#5800: Handle missing files in list\_with\_delimiter [\#5803](https://github.com/apache/arrow-rs/pull/5803) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([hesampakdaman](https://github.com/hesampakdaman)) +- Update nix requirement from 0.28.0 to 0.29.0 in /object\_store [\#5799](https://github.com/apache/arrow-rs/pull/5799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update itertools requirement from 0.12.0 to 0.13.0 in /object\_store [\#5780](https://github.com/apache/arrow-rs/pull/5780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add additional WriteMultipart tests \(\#5743\) [\#5746](https://github.com/apache/arrow-rs/pull/5746) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + + + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* ## [object_store_0.10.1](https://github.com/apache/arrow-rs/tree/object_store_0.10.1) (2024-05-10) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0267ba8..dc71171 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,52 +19,21 @@ # Changelog -## [object_store_0.10.2](https://github.com/apache/arrow-rs/tree/object_store_0.10.2) (2024-07-17) +## [object_store_0.11.0](https://github.com/apache/arrow-rs/tree/object_store_0.11.0) (2024-08-12) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.1...object_store_0.10.2) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.2...object_store_0.11.0) -**Implemented enhancements:** +**Breaking changes:** -- Relax `WriteMultipart` API to support aborting after completion [\#5977](https://github.com/apache/arrow-rs/issues/5977) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Make ObjectStoreScheme in the object\_store crate public [\#5911](https://github.com/apache/arrow-rs/issues/5911) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add BufUploader to implement same feature upon `WriteMultipart` like `BufWriter` [\#5834](https://github.com/apache/arrow-rs/issues/5834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Make object\_store errors non-exhaustive [\#6165](https://github.com/apache/arrow-rs/pull/6165) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update snafu to `0.8.0` in object\_store \(\#5930\) [\#6070](https://github.com/apache/arrow-rs/pull/6070) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -**Fixed bugs:** - -- Investigate why `InstanceCredentialProvider::cache` is flagged as dead code [\#5884](https://github.com/apache/arrow-rs/issues/5884) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[object\_store\] Potential race condition in `list_with_delimiter` on `Local` [\#5800](https://github.com/apache/arrow-rs/issues/5800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] - -**Documentation updates:** - -- Correct timeout in comment from 5s to 30s [\#6073](https://github.com/apache/arrow-rs/pull/6073) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trungda](https://github.com/trungda)) -- docs: Fix broken links of object\_store\_opendal README [\#5929](https://github.com/apache/arrow-rs/pull/5929) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) -- docs: Add object\_store\_opendal as related projects [\#5926](https://github.com/apache/arrow-rs/pull/5926) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) -- chore: update docs to delineate which ObjectStore lists are recursive [\#5794](https://github.com/apache/arrow-rs/pull/5794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wiedld](https://github.com/wiedld)) -- Document object store release cadence [\#5750](https://github.com/apache/arrow-rs/pull/5750) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) **Merged pull requests:** -- Sanitize error message for sensitive requests [\#6074](https://github.com/apache/arrow-rs/pull/6074) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update quick-xml requirement from 0.35.0 to 0.36.0 in /object\_store [\#6032](https://github.com/apache/arrow-rs/pull/6032) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- use GCE metadata server env var overrides [\#6015](https://github.com/apache/arrow-rs/pull/6015) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([barronw](https://github.com/barronw)) -- Update quick-xml requirement from 0.34.0 to 0.35.0 in /object\_store [\#5983](https://github.com/apache/arrow-rs/pull/5983) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Automatically cleanup empty dirs in LocalFileSystem [\#5978](https://github.com/apache/arrow-rs/pull/5978) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) -- WriteMultipart Abort on MultipartUpload::complete Error [\#5974](https://github.com/apache/arrow-rs/pull/5974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) -- Update quick-xml requirement from 0.33.0 to 0.34.0 in /object\_store [\#5954](https://github.com/apache/arrow-rs/pull/5954) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update quick-xml requirement from 0.32.0 to 0.33.0 in /object\_store [\#5946](https://github.com/apache/arrow-rs/pull/5946) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add `MultipartUpload` blanket implementation for `Box` [\#5919](https://github.com/apache/arrow-rs/pull/5919) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([fsdvh](https://github.com/fsdvh)) -- Add user defined metadata [\#5915](https://github.com/apache/arrow-rs/pull/5915) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([criccomini](https://github.com/criccomini)) -- Make ObjectStoreScheme public [\#5912](https://github.com/apache/arrow-rs/pull/5912) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([orf](https://github.com/orf)) -- chore: Remove not used cache in InstanceCredentialProvider [\#5888](https://github.com/apache/arrow-rs/pull/5888) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) -- Fix clippy for object\_store [\#5883](https://github.com/apache/arrow-rs/pull/5883) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Update quick-xml requirement from 0.31.0 to 0.32.0 in /object\_store [\#5870](https://github.com/apache/arrow-rs/pull/5870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- feat\(object\_store\): Add `put` API for buffered::BufWriter [\#5835](https://github.com/apache/arrow-rs/pull/5835) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Xuanwo](https://github.com/Xuanwo)) -- Fix 5592: Colon \(:\) in in object\_store::path::{Path} is not handled on Windows [\#5830](https://github.com/apache/arrow-rs/pull/5830) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([hesampakdaman](https://github.com/hesampakdaman)) -- Fix issue \#5800: Handle missing files in list\_with\_delimiter [\#5803](https://github.com/apache/arrow-rs/pull/5803) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([hesampakdaman](https://github.com/hesampakdaman)) -- Update nix requirement from 0.28.0 to 0.29.0 in /object\_store [\#5799](https://github.com/apache/arrow-rs/pull/5799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update 
itertools requirement from 0.12.0 to 0.13.0 in /object\_store [\#5780](https://github.com/apache/arrow-rs/pull/5780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add additional WriteMultipart tests \(\#5743\) [\#5746](https://github.com/apache/arrow-rs/pull/5746) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) - +- feat\(object\_store\): add `PermissionDenied` variant to top-level error [\#6194](https://github.com/apache/arrow-rs/pull/6194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kyle-mccarthy](https://github.com/kyle-mccarthy)) +- Update object store MSRV to `1.64` [\#6123](https://github.com/apache/arrow-rs/pull/6123) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Fix clippy in object\_store crate [\#6120](https://github.com/apache/arrow-rs/pull/6120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index 71e0bcb..a878c0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.10.2" +version = "0.11.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 9c684ea..142bbb0 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.10.1" -FUTURE_RELEASE="object_store_0.10.2" +SINCE_TAG="object_store_0.10.2" +FUTURE_RELEASE="object_store_0.11.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 09c6a66c16b9c7e1361de57c6ce4b65d3e50bce6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Aug 2024 06:11:26 -0400 Subject: [PATCH 331/397] Add LICENSE and NOTICE files to object_store (#6234) * Add LICENSE and NOTICE files to object_store * Update object_store/NOTICE.txt Co-authored-by: Xuanwo * Update object_store/LICENSE.txt --------- Co-authored-by: Xuanwo --- LICENSE.txt | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++ NOTICE.txt | 5 ++ 2 files changed, 209 insertions(+) create mode 100644 LICENSE.txt create mode 100644 NOTICE.txt diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..de4b130 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,204 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 0000000..0a23eee --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,5 @@ +Apache Arrow Object Store +Copyright 2020-2024 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). From a1935374d456b08129a5b702c002233a3315643d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Aug 2024 06:19:52 -0400 Subject: [PATCH 332/397] Update changelog for object_store 0.11.0 release (#6238) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc71171..18dde11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ **Merged pull requests:** +- Add LICENSE and NOTICE files to object_store [\#6234](https://github.com/apache/arrow-rs/pull/6234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) - feat\(object\_store\): add `PermissionDenied` variant to top-level error [\#6194](https://github.com/apache/arrow-rs/pull/6194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kyle-mccarthy](https://github.com/kyle-mccarthy)) - Update object store MSRV to `1.64` [\#6123](https://github.com/apache/arrow-rs/pull/6123) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) - Fix clippy in object\_store crate [\#6120](https://github.com/apache/arrow-rs/pull/6120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) From 64f9c85f4e82d4369426edcbbdf7d21eabff3526 Mon Sep 17 00:00:00 2001 From: Jiacheng Yang <92543367+jiachengdb@users.noreply.github.com> Date: Thu, 15 Aug 2024 13:30:55 -0700 Subject: [PATCH 333/397] feat(object_store): add support for server-side encryption with customer-provided keys (SSE-C) (#6230) * Add support for server-side encryption with customer-provided keys (SSE-C). * Add SSE-C test using MinIO. * Visibility change * add nocapture to verify the test indeed runs * cargo fmt * Update object_store/src/aws/mod.rs use environment variables Co-authored-by: Will Jones * Update object_store/CONTRIBUTING.md use environment variables Co-authored-by: Will Jones * Fix api --------- Co-authored-by: Will Jones --- CONTRIBUTING.md | 48 +++++++++++++++ src/aws/builder.rs | 148 ++++++++++++++++++++++++++++++++++++--------- src/aws/client.rs | 56 +++++++++++++++-- src/aws/mod.rs | 68 ++++++++++++++++++++- 4 files changed, 285 insertions(+), 35 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4b0ef1f..5444ec7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -101,6 +101,54 @@ export AWS_SERVER_SIDE_ENCRYPTION=aws:kms:dsse cargo test --features aws ``` +#### SSE-C Encryption tests + +Unfortunately, localstack does not support SSE-C encryption (https://github.com/localstack/localstack/issues/11356). + +We will use [MinIO](https://min.io/docs/minio/container/operations/server-side-encryption.html) to test SSE-C encryption. + +First, create a self-signed certificate to enable HTTPS for MinIO, as SSE-C requires HTTPS. + +```shell +mkdir ~/certs +cd ~/certs +openssl genpkey -algorithm RSA -out private.key +openssl req -new -key private.key -out request.csr -subj "/C=US/ST=State/L=City/O=Organization/OU=Unit/CN=example.com/emailAddress=email@example.com" +openssl x509 -req -days 365 -in request.csr -signkey private.key -out public.crt +rm request.csr +``` + +Second, start MinIO with the self-signed certificate. 
+ +```shell +docker run -d \ + -p 9000:9000 \ + --name minio \ + -v ${HOME}/certs:/root/.minio/certs \ + -e "MINIO_ROOT_USER=minio" \ + -e "MINIO_ROOT_PASSWORD=minio123" \ + minio/minio server /data +``` + +Create a test bucket. + +```shell +export AWS_BUCKET_NAME=test-bucket +export AWS_ACCESS_KEY_ID=minio +export AWS_SECRET_ACCESS_KEY=minio123 +export AWS_ENDPOINT=https://localhost:9000 +aws s3 mb s3://test-bucket --endpoint-url=https://localhost:9000 --no-verify-ssl +``` + +Run the tests. The real test is `test_s3_ssec_encryption_with_minio()` + +```shell +export TEST_S3_SSEC_ENCRYPTION=1 +cargo test --features aws --package object_store --lib aws::tests::test_s3_ssec_encryption_with_minio -- --exact --nocapture +``` + + + ### Azure To test the Azure integration diff --git a/src/aws/builder.rs b/src/aws/builder.rs index ffef3fb..574345c 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -26,7 +26,10 @@ use crate::aws::{ use crate::client::TokenCredentialProvider; use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use itertools::Itertools; +use md5::{Digest, Md5}; use reqwest::header::{HeaderMap, HeaderValue}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -73,7 +76,7 @@ enum Error { #[snafu(display("Invalid Zone suffix for bucket '{bucket}'"))] ZoneSuffix { bucket: String }, - #[snafu(display("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", and \"sse:kms:dsse\".", passed))] + #[snafu(display("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", \"sse:kms:dsse\" and \"sse-c\".", passed))] InvalidEncryptionType { passed: String }, #[snafu(display( @@ -166,6 +169,8 @@ pub struct AmazonS3Builder { encryption_type: Option>, encryption_kms_key_id: Option, encryption_bucket_key_enabled: Option>, + /// base64-encoded 256-bit customer encryption key for SSE-C. + encryption_customer_key_base64: Option, } /// Configuration keys for [`AmazonS3Builder`] @@ -394,6 +399,9 @@ impl FromStr for AmazonS3ConfigKey { "aws_sse_bucket_key_enabled" => { Ok(Self::Encryption(S3EncryptionConfigKey::BucketKeyEnabled)) } + "aws_sse_customer_key_base64" => Ok(Self::Encryption( + S3EncryptionConfigKey::CustomerEncryptionKey, + )), _ => match s.parse() { Ok(key) => Ok(Self::Client(key)), Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), @@ -511,6 +519,9 @@ impl AmazonS3Builder { S3EncryptionConfigKey::BucketKeyEnabled => { self.encryption_bucket_key_enabled = Some(ConfigValue::Deferred(value.into())) } + S3EncryptionConfigKey::CustomerEncryptionKey => { + self.encryption_customer_key_base64 = Some(value.into()) + } }, }; self @@ -566,6 +577,9 @@ impl AmazonS3Builder { .encryption_bucket_key_enabled .as_ref() .map(ToString::to_string), + S3EncryptionConfigKey::CustomerEncryptionKey => { + self.encryption_customer_key_base64.clone() + } }, } } @@ -813,6 +827,14 @@ impl AmazonS3Builder { self } + /// Use SSE-C for server side encryption. + /// Must pass the *base64-encoded* 256-bit customer encryption key. + pub fn with_ssec_encryption(mut self, customer_key_base64: impl Into) -> Self { + self.encryption_type = Some(ConfigValue::Parsed(S3EncryptionType::SseC)); + self.encryption_customer_key_base64 = customer_key_base64.into().into(); + self + } + /// Set whether to enable bucket key for server side encryption. This overrides /// the bucket default setting for bucket keys. 
/// @@ -953,6 +975,7 @@ impl AmazonS3Builder { self.encryption_bucket_key_enabled .map(|val| val.get()) .transpose()?, + self.encryption_customer_key_base64, )? } else { S3EncryptionHeaders::default() @@ -994,15 +1017,14 @@ fn parse_bucket_az(bucket: &str) -> Option<&str> { /// These options are used to configure server-side encryption for S3 objects. /// To configure them, pass them to [`AmazonS3Builder::with_config`]. /// -/// Both [SSE-KMS] and [DSSE-KMS] are supported. [SSE-C] is not yet supported. -/// +/// [SSE-S3]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingServerSideEncryption.html /// [SSE-KMS]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html /// [DSSE-KMS]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingDSSEncryption.html /// [SSE-C]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] #[non_exhaustive] pub enum S3EncryptionConfigKey { - /// Type of encryption to use. If set, must be one of "AES256", "aws:kms", or "aws:kms:dsse". + /// Type of encryption to use. If set, must be one of "AES256" (SSE-S3), "aws:kms" (SSE-KMS), "aws:kms:dsse" (DSSE-KMS) or "sse-c". ServerSideEncryption, /// The KMS key ID to use for server-side encryption. If set, ServerSideEncryption /// must be "aws:kms" or "aws:kms:dsse". @@ -1010,6 +1032,10 @@ pub enum S3EncryptionConfigKey { /// If set to true, will use the bucket's default KMS key for server-side encryption. /// If set to false, will disable the use of the bucket's default KMS key for server-side encryption. BucketKeyEnabled, + + /// The base64 encoded, 256-bit customer encryption key to use for server-side encryption. + /// If set, ServerSideEncryption must be "sse-c". + CustomerEncryptionKey, } impl AsRef for S3EncryptionConfigKey { @@ -1018,6 +1044,7 @@ impl AsRef for S3EncryptionConfigKey { Self::ServerSideEncryption => "aws_server_side_encryption", Self::KmsKeyId => "aws_sse_kms_key_id", Self::BucketKeyEnabled => "aws_sse_bucket_key_enabled", + Self::CustomerEncryptionKey => "aws_sse_customer_key_base64", } } } @@ -1027,6 +1054,7 @@ enum S3EncryptionType { S3, SseKms, DsseKms, + SseC, } impl crate::config::Parse for S3EncryptionType { @@ -1035,6 +1063,7 @@ impl crate::config::Parse for S3EncryptionType { "AES256" => Ok(Self::S3), "aws:kms" => Ok(Self::SseKms), "aws:kms:dsse" => Ok(Self::DsseKms), + "sse-c" => Ok(Self::SseC), _ => Err(Error::InvalidEncryptionType { passed: s.into() }.into()), } } @@ -1046,6 +1075,7 @@ impl From<&S3EncryptionType> for &'static str { S3EncryptionType::S3 => "AES256", S3EncryptionType::SseKms => "aws:kms", S3EncryptionType::DsseKms => "aws:kms:dsse", + S3EncryptionType::SseC => "sse-c", } } } @@ -1062,37 +1092,87 @@ impl std::fmt::Display for S3EncryptionType { /// Whether these headers are sent depends on both the kind of encryption set /// and the kind of request being made. #[derive(Default, Clone, Debug)] -pub struct S3EncryptionHeaders(HeaderMap); +pub(super) struct S3EncryptionHeaders(pub HeaderMap); impl S3EncryptionHeaders { fn try_new( encryption_type: &S3EncryptionType, - key_id: Option, + encryption_kms_key_id: Option, bucket_key_enabled: Option, + encryption_customer_key_base64: Option, ) -> Result { let mut headers = HeaderMap::new(); - // Note: if we later add support for SSE-C, we should be sure to use - // HeaderValue::set_sensitive to prevent the key from being logged. 
- headers.insert( - "x-amz-server-side-encryption", - HeaderValue::from_static(encryption_type.into()), - ); - if let Some(key_id) = key_id { - headers.insert( - "x-amz-server-side-encryption-aws-kms-key-id", - key_id - .try_into() - .map_err(|err| Error::InvalidEncryptionHeader { - header: "kms-key-id", - source: Box::new(err), - })?, - ); - } - if let Some(bucket_key_enabled) = bucket_key_enabled { - headers.insert( - "x-amz-server-side-encryption-bucket-key-enabled", - HeaderValue::from_static(if bucket_key_enabled { "true" } else { "false" }), - ); + match encryption_type { + S3EncryptionType::S3 | S3EncryptionType::SseKms | S3EncryptionType::DsseKms => { + headers.insert( + "x-amz-server-side-encryption", + HeaderValue::from_static(encryption_type.into()), + ); + if let Some(key_id) = encryption_kms_key_id { + headers.insert( + "x-amz-server-side-encryption-aws-kms-key-id", + key_id + .try_into() + .map_err(|err| Error::InvalidEncryptionHeader { + header: "kms-key-id", + source: Box::new(err), + })?, + ); + } + if let Some(bucket_key_enabled) = bucket_key_enabled { + headers.insert( + "x-amz-server-side-encryption-bucket-key-enabled", + HeaderValue::from_static(if bucket_key_enabled { "true" } else { "false" }), + ); + } + } + S3EncryptionType::SseC => { + headers.insert( + "x-amz-server-side-encryption-customer-algorithm", + HeaderValue::from_static("AES256"), + ); + if let Some(key) = encryption_customer_key_base64 { + let mut header_value: HeaderValue = + key.clone() + .try_into() + .map_err(|err| Error::InvalidEncryptionHeader { + header: "x-amz-server-side-encryption-customer-key", + source: Box::new(err), + })?; + header_value.set_sensitive(true); + headers.insert("x-amz-server-side-encryption-customer-key", header_value); + + let decoded_key = BASE64_STANDARD.decode(key.as_bytes()).map_err(|err| { + Error::InvalidEncryptionHeader { + header: "x-amz-server-side-encryption-customer-key", + source: Box::new(err), + } + })?; + let mut hasher = Md5::new(); + hasher.update(decoded_key); + let md5 = BASE64_STANDARD.encode(hasher.finalize()); + let mut md5_header_value: HeaderValue = + md5.try_into() + .map_err(|err| Error::InvalidEncryptionHeader { + header: "x-amz-server-side-encryption-customer-key-MD5", + source: Box::new(err), + })?; + md5_header_value.set_sensitive(true); + headers.insert( + "x-amz-server-side-encryption-customer-key-MD5", + md5_header_value, + ); + } else { + return Err(Error::InvalidEncryptionHeader { + header: "x-amz-server-side-encryption-customer-key", + source: Box::new(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "Missing customer key", + )), + } + .into()); + } + } } Ok(Self(headers)) } @@ -1162,7 +1242,11 @@ mod tests { .with_config(AmazonS3ConfigKey::UnsignedPayload, "true") .with_config("aws_server_side_encryption".parse().unwrap(), "AES256") .with_config("aws_sse_kms_key_id".parse().unwrap(), "some_key_id") - .with_config("aws_sse_bucket_key_enabled".parse().unwrap(), "true"); + .with_config("aws_sse_bucket_key_enabled".parse().unwrap(), "true") + .with_config( + "aws_sse_customer_key_base64".parse().unwrap(), + "some_customer_key", + ); assert_eq!( builder @@ -1216,6 +1300,12 @@ mod tests { .unwrap(), "true" ); + assert_eq!( + builder + .get_config_value(&"aws_sse_customer_key_base64".parse().unwrap()) + .unwrap(), + "some_customer_key" + ); } #[test] diff --git a/src/aws/client.rs b/src/aws/client.rs index ab4da86..007e271 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -181,7 +181,7 @@ pub struct S3Config { pub checksum: 
Option, pub copy_if_not_exists: Option, pub conditional_put: Option, - pub encryption_headers: S3EncryptionHeaders, + pub(super) encryption_headers: S3EncryptionHeaders, } impl S3Config { @@ -522,10 +522,47 @@ impl S3Client { /// Make an S3 Copy request pub fn copy_request<'a>(&'a self, from: &Path, to: &'a Path) -> Request<'a> { let source = format!("{}/{}", self.config.bucket, encode_path(from)); + + let mut copy_source_encryption_headers = HeaderMap::new(); + if let Some(customer_algorithm) = self + .config + .encryption_headers + .0 + .get("x-amz-server-side-encryption-customer-algorithm") + { + copy_source_encryption_headers.insert( + "x-amz-copy-source-server-side-encryption-customer-algorithm", + customer_algorithm.clone(), + ); + } + if let Some(customer_key) = self + .config + .encryption_headers + .0 + .get("x-amz-server-side-encryption-customer-key") + { + copy_source_encryption_headers.insert( + "x-amz-copy-source-server-side-encryption-customer-key", + customer_key.clone(), + ); + } + if let Some(customer_key_md5) = self + .config + .encryption_headers + .0 + .get("x-amz-server-side-encryption-customer-key-MD5") + { + copy_source_encryption_headers.insert( + "x-amz-copy-source-server-side-encryption-customer-key-MD5", + customer_key_md5.clone(), + ); + } + self.request(Method::PUT, to) .idempotent(true) .header(©_SOURCE_HEADER, &source) .headers(self.config.encryption_headers.clone().into()) + .headers(copy_source_encryption_headers) .with_session_creds(false) } @@ -562,13 +599,21 @@ impl S3Client { ) -> Result { let part = (part_idx + 1).to_string(); - let response = self + let mut request = self .request(Method::PUT, path) .with_payload(data) .query(&[("partNumber", &part), ("uploadId", upload_id)]) - .idempotent(true) - .send() - .await?; + .idempotent(true); + if self + .config + .encryption_headers + .0 + .contains_key("x-amz-server-side-encryption-customer-algorithm") + { + // If SSE-C is used, we must include the encryption headers in every upload request. + request = request.with_encryption_headers(); + } + let response = request.send().await?; let content_id = get_etag(response.headers()).context(MetadataSnafu)?; Ok(PartId { content_id }) @@ -660,6 +705,7 @@ impl GetClient for S3Client { }; let mut builder = self.client.request(method, url); + builder = builder.headers(self.config.encryption_headers.clone().into()); if let Some(v) = &options.version { builder = builder.query(&[("versionId", v)]) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index f5204a5..4a773e7 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -60,7 +60,7 @@ mod dynamo; mod precondition; mod resolve; -pub use builder::{AmazonS3Builder, AmazonS3ConfigKey, S3EncryptionHeaders}; +pub use builder::{AmazonS3Builder, AmazonS3ConfigKey}; pub use checksum::Checksum; pub use dynamo::DynamoCommit; pub use precondition::{S3ConditionalPut, S3CopyIfNotExists}; @@ -412,6 +412,9 @@ mod tests { use crate::client::get::GetClient; use crate::integration::*; use crate::tests::*; + use crate::ClientOptions; + use base64::prelude::BASE64_STANDARD; + use base64::Engine; use hyper::HeaderMap; const NON_EXISTENT_NAME: &str = "nonexistentname"; @@ -605,4 +608,67 @@ mod tests { store.delete(location).await.unwrap(); } } + + /// See CONTRIBUTING.md for the MinIO setup for this test. 
+ #[tokio::test] + async fn test_s3_ssec_encryption_with_minio() { + if std::env::var("TEST_S3_SSEC_ENCRYPTION").is_err() { + eprintln!("Skipping S3 SSE-C encryption test"); + return; + } + eprintln!("Running S3 SSE-C encryption test"); + + let customer_key = "1234567890abcdef1234567890abcdef"; + let expected_md5 = "JMwgiexXqwuPqIPjYFmIZQ=="; + + let store = AmazonS3Builder::from_env() + .with_ssec_encryption(BASE64_STANDARD.encode(customer_key)) + .with_client_options(ClientOptions::default().with_allow_invalid_certificates(true)) + .build() + .unwrap(); + + let data = PutPayload::from(vec![3u8; 1024]); + + let locations = [ + Path::from("test-encryption-1"), + Path::from("test-encryption-2"), + Path::from("test-encryption-3"), + ]; + + // Test put with sse-c. + store.put(&locations[0], data.clone()).await.unwrap(); + + // Test copy with sse-c. + store.copy(&locations[0], &locations[1]).await.unwrap(); + + // Test multipart upload with sse-c. + let mut upload = store.put_multipart(&locations[2]).await.unwrap(); + upload.put_part(data.clone()).await.unwrap(); + upload.complete().await.unwrap(); + + // Test get with sse-c. + for location in &locations { + let res = store + .client + .get_request(location, GetOptions::default()) + .await + .unwrap(); + let headers = res.headers(); + assert_eq!( + headers + .get("x-amz-server-side-encryption-customer-algorithm") + .expect("object is not encrypted with SSE-C"), + "AES256" + ); + + assert_eq!( + headers + .get("x-amz-server-side-encryption-customer-key-MD5") + .expect("object is not encrypted with SSE-C"), + expected_md5 + ); + + store.delete(location).await.unwrap(); + } + } } From 15e294d01146c3f7f2425b9cd5d2d7f3b53dac03 Mon Sep 17 00:00:00 2001 From: ByteBaker <42913098+ByteBaker@users.noreply.github.com> Date: Tue, 20 Aug 2024 00:43:09 +0530 Subject: [PATCH 334/397] feat: further TLS options on ClientOptions: #5034 (#6148) * feat: further TLS options on ClientOptions: #5034 * Rename to Certificate and with_root_certificate, add docs --------- Co-authored-by: Andrew Lamb --- src/client/mod.rs | 64 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 4 +-- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/src/client/mod.rs b/src/client/mod.rs index 43fd658..c45833b 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -167,10 +167,60 @@ impl FromStr for ClientConfigKey { } } +/// Represents a CA certificate provided by the user. +/// +/// This is used to configure the client to trust a specific certificate. See +/// [Self::from_pem] for an example +#[derive(Debug, Clone)] +pub struct Certificate(reqwest::tls::Certificate); + +impl Certificate { + /// Create a `Certificate` from a PEM encoded certificate. + /// + /// # Example from a PEM file + /// + /// ```no_run + /// # use object_store::Certificate; + /// # use std::fs::File; + /// # use std::io::Read; + /// let mut buf = Vec::new(); + /// File::open("my_cert.pem").unwrap() + /// .read_to_end(&mut buf).unwrap(); + /// let cert = Certificate::from_pem(&buf).unwrap(); + /// + /// ``` + pub fn from_pem(pem: &[u8]) -> Result { + Ok(Self( + reqwest::tls::Certificate::from_pem(pem).map_err(map_client_error)?, + )) + } + + /// Create a collection of `Certificate` from a PEM encoded certificate + /// bundle. + /// + /// Files that contain such collections have extensions such as `.crt`, + /// `.cer` and `.pem` files. + pub fn from_pem_bundle(pem_bundle: &[u8]) -> Result> { + Ok(reqwest::tls::Certificate::from_pem_bundle(pem_bundle) + .map_err(map_client_error)? 
+ .into_iter() + .map(Self) + .collect()) + } + + /// Create a `Certificate` from a binary DER encoded certificate. + pub fn from_der(der: &[u8]) -> Result { + Ok(Self( + reqwest::tls::Certificate::from_der(der).map_err(map_client_error)?, + )) + } +} + /// HTTP client configuration for remote object stores #[derive(Debug, Clone)] pub struct ClientOptions { user_agent: Option>, + root_certificates: Vec, content_type_map: HashMap, default_content_type: Option, default_headers: Option, @@ -201,6 +251,7 @@ impl Default for ClientOptions { // we opt for a slightly higher default timeout of 30 seconds Self { user_agent: None, + root_certificates: Default::default(), content_type_map: Default::default(), default_content_type: None, default_headers: None, @@ -310,6 +361,15 @@ impl ClientOptions { self } + /// Add a custom root certificate. + /// + /// This can be used to connect to a server that has a self-signed + /// certificate for example. + pub fn with_root_certificate(mut self, certificate: Certificate) -> Self { + self.root_certificates.push(certificate); + self + } + /// Set the default CONTENT_TYPE for uploads pub fn with_default_content_type(mut self, mime: impl Into) -> Self { self.default_content_type = Some(mime.into()); @@ -541,6 +601,10 @@ impl ClientOptions { builder = builder.proxy(proxy); } + for certificate in &self.root_certificates { + builder = builder.add_root_certificate(certificate.0.clone()); + } + if let Some(timeout) = &self.timeout { builder = builder.timeout(timeout.get()?) } diff --git a/src/lib.rs b/src/lib.rs index 4184d58..4b43f0c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -526,8 +526,8 @@ mod client; #[cfg(feature = "cloud")] pub use client::{ - backoff::BackoffConfig, retry::RetryConfig, ClientConfigKey, ClientOptions, CredentialProvider, - StaticCredentialProvider, + backoff::BackoffConfig, retry::RetryConfig, Certificate, ClientConfigKey, ClientOptions, + CredentialProvider, StaticCredentialProvider, }; #[cfg(feature = "cloud")] From 2124260876e505f7e24e4189fa1af528a180a31c Mon Sep 17 00:00:00 2001 From: Jiacheng Yang <92543367+jiachengdb@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:36:59 -0700 Subject: [PATCH 335/397] Only add encryption headers for for SSE-C in get. 
(#6260) --- src/aws/client.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 007e271..6fe4889 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -705,7 +705,14 @@ impl GetClient for S3Client { }; let mut builder = self.client.request(method, url); - builder = builder.headers(self.config.encryption_headers.clone().into()); + if self + .config + .encryption_headers + .0 + .contains_key("x-amz-server-side-encryption-customer-algorithm") + { + builder = builder.headers(self.config.encryption_headers.clone().into()); + } if let Some(v) = &options.version { builder = builder.query(&[("versionId", v)]) From 732e910462b3cbe978267d1784475c87ea304532 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 29 Aug 2024 09:35:00 -0700 Subject: [PATCH 336/397] docs[object_store]: clarify the backoff strategy that is actually implemented (#6325) * Clarify the backoff strategy that is actually implemented * Update object_store/src/client/backoff.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/client/backoff.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/client/backoff.rs b/src/client/backoff.rs index e015891..a1fa26c 100644 --- a/src/client/backoff.rs +++ b/src/client/backoff.rs @@ -18,7 +18,12 @@ use rand::prelude::*; use std::time::Duration; -/// Exponential backoff with jitter +/// Exponential backoff with decorrelated jitter algorithm +/// +/// The first backoff will always be `init_backoff`. +/// +/// Subsequent backoffs will pick a random value between `init_backoff` and +/// `base * previous` where `previous` is the duration of the previous backoff /// /// See #[allow(missing_copy_implementations)] @@ -28,7 +33,7 @@ pub struct BackoffConfig { pub init_backoff: Duration, /// The maximum backoff duration pub max_backoff: Duration, - /// The base of the exponential to use + /// The multiplier to use for the next backoff duration pub base: f64, } From 1211e50ba9a182f088d12db0ba42ad363f351303 Mon Sep 17 00:00:00 2001 From: Alex Wilcoxson Date: Mon, 2 Sep 2024 04:28:32 -0500 Subject: [PATCH 337/397] fix: azure sas token visible in logs (#6323) --- src/azure/client.rs | 47 +++++++++++++++++++++++++++++++++++++---- src/azure/credential.rs | 12 +++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index b5e82c2..0499051 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -226,11 +226,16 @@ impl<'a> PutRequest<'a> { async fn send(self) -> Result { let credential = self.config.get_credential().await?; + let sensitive = credential + .as_deref() + .map(|c| c.sensitive_request()) + .unwrap_or_default(); let response = self .builder .header(CONTENT_LENGTH, self.payload.content_length()) .with_azure_authorization(&credential, &self.config.account) .retryable(&self.config.retry_config) + .sensitive(sensitive) .idempotent(self.idempotent) .payload(Some(self.payload)) .send() @@ -356,12 +361,18 @@ impl AzureClient { let credential = self.get_credential().await?; let url = self.config.path_url(path); + let sensitive = credential + .as_deref() + .map(|c| c.sensitive_request()) + .unwrap_or_default(); self.client .request(Method::DELETE, url) .query(query) .header(&DELETE_SNAPSHOTS, "include") .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) + 
.retryable(&self.config.retry_config) + .sensitive(sensitive) + .send() .await .context(DeleteRequestSnafu { path: path.as_ref(), @@ -392,9 +403,14 @@ impl AzureClient { builder = builder.header(IF_NONE_MATCH, "*"); } + let sensitive = credential + .as_deref() + .map(|c| c.sensitive_request()) + .unwrap_or_default(); builder .with_azure_authorization(&credential, &self.config.account) .retryable(&self.config.retry_config) + .sensitive(sensitive) .idempotent(overwrite) .send() .await @@ -423,6 +439,10 @@ impl AzureClient { )); body.push_str(""); + let sensitive = credential + .as_deref() + .map(|c| c.sensitive_request()) + .unwrap_or_default(); let response = self .client .request(Method::POST, url) @@ -430,6 +450,7 @@ impl AzureClient { .query(&[("restype", "service"), ("comp", "userdelegationkey")]) .with_azure_authorization(&credential, &self.config.account) .retryable(&self.config.retry_config) + .sensitive(sensitive) .idempotent(true) .send() .await @@ -482,12 +503,18 @@ impl AzureClient { pub async fn get_blob_tagging(&self, path: &Path) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); + let sensitive = credential + .as_deref() + .map(|c| c.sensitive_request()) + .unwrap_or_default(); let response = self .client .request(Method::GET, url) .query(&[("comp", "tags")]) .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) + .retryable(&self.config.retry_config) + .sensitive(sensitive) + .send() .await .context(GetRequestSnafu { path: path.as_ref(), @@ -536,10 +563,16 @@ impl GetClient for AzureClient { builder = builder.query(&[("versionid", v)]) } + let sensitive = credential + .as_deref() + .map(|c| c.sensitive_request()) + .unwrap_or_default(); let response = builder .with_get_options(options) .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) + .retryable(&self.config.retry_config) + .sensitive(sensitive) + .send() .await .context(GetRequestSnafu { path: path.as_ref(), @@ -590,12 +623,18 @@ impl ListClient for AzureClient { query.push(("marker", token)) } + let sensitive = credential + .as_deref() + .map(|c| c.sensitive_request()) + .unwrap_or_default(); let response = self .client .request(Method::GET, url) .query(&query) .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) + .retryable(&self.config.retry_config) + .sensitive(sensitive) + .send() .await .context(ListRequestSnafu)? 
.bytes() diff --git a/src/azure/credential.rs b/src/azure/credential.rs index c8212a9..7808c7c 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -130,6 +130,18 @@ pub enum AzureCredential { BearerToken(String), } +impl AzureCredential { + /// Determines if the credential requires the request be treated as sensitive + pub fn sensitive_request(&self) -> bool { + match self { + Self::AccessKey(_) => false, + Self::BearerToken(_) => false, + // SAS tokens are sent as query parameters in the url + Self::SASToken(_) => true, + } + } +} + /// A list of known Azure authority hosts pub mod authority_hosts { /// China-based Azure Authority Host From 32f85b47a10be19ee17e555e1a3e5b71699a0d50 Mon Sep 17 00:00:00 2001 From: Costi Ciudatu Date: Wed, 4 Sep 2024 12:00:02 +0300 Subject: [PATCH 338/397] [object_store] Propagate env vars as object store client options (#6334) * [object_store] Propagate env vars as object store client options * [object_store] Include the missing variants in the FromStr implementation of ClientConfigKey * cargo fmt --- src/aws/builder.rs | 15 ++++++++++++++- src/azure/builder.rs | 15 ++++++++++++++- src/client/mod.rs | 2 ++ src/gcp/builder.rs | 15 ++++++++++++++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 574345c..75acb73 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -402,7 +402,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_sse_customer_key_base64" => Ok(Self::Encryption( S3EncryptionConfigKey::CustomerEncryptionKey, )), - _ => match s.parse() { + _ => match s.strip_prefix("aws_").unwrap_or(s).parse() { Ok(key) => Ok(Self::Client(key)), Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), }, @@ -1455,4 +1455,17 @@ mod tests { assert_eq!(parse_bucket_az(bucket), expected) } } + + #[test] + fn aws_test_client_opts() { + let key = "AWS_PROXY_URL"; + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + assert_eq!( + AmazonS3ConfigKey::Client(ClientConfigKey::ProxyUrl), + config_key + ); + } else { + panic!("{} not propagated as ClientConfigKey", key); + } + } } diff --git a/src/azure/builder.rs b/src/azure/builder.rs index c0c4e89..0208073 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -408,7 +408,7 @@ impl FromStr for AzureConfigKey { "azure_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), // Backwards compatibility "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), - _ => match s.parse() { + _ => match s.strip_prefix("azure_").unwrap_or(s).parse() { Ok(key) => Ok(Self::Client(key)), Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), }, @@ -1103,4 +1103,17 @@ mod tests { let pairs = split_sas(raw_sas).unwrap(); assert_eq!(expected, pairs); } + + #[test] + fn azure_test_client_opts() { + let key = "AZURE_PROXY_URL"; + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + assert_eq!( + AzureConfigKey::Client(ClientConfigKey::ProxyUrl), + config_key + ); + } else { + panic!("{} not propagated as ClientConfigKey", key); + } + } } diff --git a/src/client/mod.rs b/src/client/mod.rs index c45833b..9a3b705 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -157,6 +157,8 @@ impl FromStr for ClientConfigKey { "pool_idle_timeout" => Ok(Self::PoolIdleTimeout), "pool_max_idle_per_host" => Ok(Self::PoolMaxIdlePerHost), "proxy_url" => Ok(Self::ProxyUrl), + "proxy_ca_certificate" => Ok(Self::ProxyCaCertificate), + "proxy_excludes" => Ok(Self::ProxyExcludes), "timeout" => 
Ok(Self::Timeout), "user_agent" => Ok(Self::UserAgent), _ => Err(super::Error::UnknownConfigurationKey { diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 82dab14..26cc821 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -185,7 +185,7 @@ impl FromStr for GoogleConfigKey { "google_service_account_key" | "service_account_key" => Ok(Self::ServiceAccountKey), "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => Ok(Self::Bucket), "google_application_credentials" => Ok(Self::ApplicationCredentials), - _ => match s.parse() { + _ => match s.strip_prefix("google_").unwrap_or(s).parse() { Ok(key) => Ok(Self::Client(key)), Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), }, @@ -671,4 +671,17 @@ mod tests { google_bucket_name ); } + + #[test] + fn gcp_test_client_opts() { + let key = "GOOGLE_PROXY_URL"; + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + assert_eq!( + GoogleConfigKey::Client(ClientConfigKey::ProxyUrl), + config_key + ); + } else { + panic!("{} not propagated as ClientConfigKey", key); + } + } } From c8117150935a089c1411003241e8834e9fd559d9 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Fri, 6 Sep 2024 06:23:42 +0100 Subject: [PATCH 339/397] `object_store::GetOptions` derive `Clone` (#6361) * object_store::GetOptions derive Clone * undo wrong submodule * bump --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 4b43f0c..8820983 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -911,7 +911,7 @@ pub struct ObjectMeta { } /// Options for a get request, such as range -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct GetOptions { /// Request will succeed if the `ObjectMeta::e_tag` matches /// otherwise returning [`Error::Precondition`] From 8fd9a5394d4c7c8f3597ee574cccc4824f306026 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Fri, 6 Sep 2024 07:26:24 +0200 Subject: [PATCH 340/397] object_store/delimited: Fix `TrailingEscape` condition (#6265) This seems like a copy-paste mistake since checking `is_quote` twice is probably wrong... --- src/delimited.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/delimited.rs b/src/delimited.rs index 4f25c9d..96f88bf 100644 --- a/src/delimited.rs +++ b/src/delimited.rs @@ -126,7 +126,7 @@ impl LineDelimiter { fn finish(&mut self) -> Result { if !self.remainder.is_empty() { ensure!(!self.is_quote, UnterminatedStringSnafu); - ensure!(!self.is_quote, TrailingEscapeSnafu); + ensure!(!self.is_escape, TrailingEscapeSnafu); self.complete .push_back(Bytes::from(std::mem::take(&mut self.remainder))) From 55cdf518121a724fb7b478b87e442b9139922485 Mon Sep 17 00:00:00 2001 From: Tzu Gwo Date: Thu, 19 Sep 2024 04:47:51 +0800 Subject: [PATCH 341/397] Derive `Clone` for `object_store::aws::AmazonS3` (#6414) --- src/aws/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 4a773e7..a27ed05 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -77,7 +77,7 @@ use crate::client::parts::Parts; pub use credential::{AwsAuthorizer, AwsCredential}; /// Interface for [Amazon S3](https://aws.amazon.com/s3/). 
-#[derive(Debug)] +#[derive(Debug, Clone)] pub struct AmazonS3 { client: Arc, } From fe9c941cf670c677244d204e3d4364a5f3565039 Mon Sep 17 00:00:00 2001 From: Robin Lin <128118209+RobinLin666@users.noreply.github.com> Date: Sat, 21 Sep 2024 18:15:26 +0800 Subject: [PATCH 342/397] object_score: Support Azure Fabric OAuth Provider (#6382) * Update Azure dependencies and add support for Fabric token authentication * Refactor Azure credential provider to support Fabric token authentication * Refactor Azure credential provider to remove unnecessary print statements and improve token handling * Bump object_store version to 0.11.0 * Refactor Azure credential provider to remove unnecessary print statements and improve token handling --- src/azure/builder.rs | 88 ++++++++++++++++++++++++++++++- src/azure/credential.rs | 114 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 199 insertions(+), 3 deletions(-) diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 0208073..35cedea 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -17,8 +17,8 @@ use crate::azure::client::{AzureClient, AzureConfig}; use crate::azure::credential::{ - AzureAccessKey, AzureCliCredential, ClientSecretOAuthProvider, ImdsManagedIdentityProvider, - WorkloadIdentityOAuthProvider, + AzureAccessKey, AzureCliCredential, ClientSecretOAuthProvider, FabricTokenOAuthProvider, + ImdsManagedIdentityProvider, WorkloadIdentityOAuthProvider, }; use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; use crate::client::TokenCredentialProvider; @@ -172,6 +172,14 @@ pub struct MicrosoftAzureBuilder { use_fabric_endpoint: ConfigValue, /// When set to true, skips tagging objects disable_tagging: ConfigValue, + /// Fabric token service url + fabric_token_service_url: Option, + /// Fabric workload host + fabric_workload_host: Option, + /// Fabric session token + fabric_session_token: Option, + /// Fabric cluster identifier + fabric_cluster_identifier: Option, } /// Configuration keys for [`MicrosoftAzureBuilder`] @@ -336,6 +344,34 @@ pub enum AzureConfigKey { /// - `disable_tagging` DisableTagging, + /// Fabric token service url + /// + /// Supported keys: + /// - `azure_fabric_token_service_url` + /// - `fabric_token_service_url` + FabricTokenServiceUrl, + + /// Fabric workload host + /// + /// Supported keys: + /// - `azure_fabric_workload_host` + /// - `fabric_workload_host` + FabricWorkloadHost, + + /// Fabric session token + /// + /// Supported keys: + /// - `azure_fabric_session_token` + /// - `fabric_session_token` + FabricSessionToken, + + /// Fabric cluster identifier + /// + /// Supported keys: + /// - `azure_fabric_cluster_identifier` + /// - `fabric_cluster_identifier` + FabricClusterIdentifier, + /// Client options Client(ClientConfigKey), } @@ -361,6 +397,10 @@ impl AsRef for AzureConfigKey { Self::SkipSignature => "azure_skip_signature", Self::ContainerName => "azure_container_name", Self::DisableTagging => "azure_disable_tagging", + Self::FabricTokenServiceUrl => "azure_fabric_token_service_url", + Self::FabricWorkloadHost => "azure_fabric_workload_host", + Self::FabricSessionToken => "azure_fabric_session_token", + Self::FabricClusterIdentifier => "azure_fabric_cluster_identifier", Self::Client(key) => key.as_ref(), } } @@ -406,6 +446,14 @@ impl FromStr for AzureConfigKey { "azure_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), "azure_container_name" | "container_name" => Ok(Self::ContainerName), "azure_disable_tagging" | "disable_tagging" => 
Ok(Self::DisableTagging), + "azure_fabric_token_service_url" | "fabric_token_service_url" => { + Ok(Self::FabricTokenServiceUrl) + } + "azure_fabric_workload_host" | "fabric_workload_host" => Ok(Self::FabricWorkloadHost), + "azure_fabric_session_token" | "fabric_session_token" => Ok(Self::FabricSessionToken), + "azure_fabric_cluster_identifier" | "fabric_cluster_identifier" => { + Ok(Self::FabricClusterIdentifier) + } // Backwards compatibility "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.strip_prefix("azure_").unwrap_or(s).parse() { @@ -525,6 +573,14 @@ impl MicrosoftAzureBuilder { } AzureConfigKey::ContainerName => self.container_name = Some(value.into()), AzureConfigKey::DisableTagging => self.disable_tagging.parse(value), + AzureConfigKey::FabricTokenServiceUrl => { + self.fabric_token_service_url = Some(value.into()) + } + AzureConfigKey::FabricWorkloadHost => self.fabric_workload_host = Some(value.into()), + AzureConfigKey::FabricSessionToken => self.fabric_session_token = Some(value.into()), + AzureConfigKey::FabricClusterIdentifier => { + self.fabric_cluster_identifier = Some(value.into()) + } }; self } @@ -561,6 +617,10 @@ impl MicrosoftAzureBuilder { AzureConfigKey::Client(key) => self.client_options.get_config_value(key), AzureConfigKey::ContainerName => self.container_name.clone(), AzureConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), + AzureConfigKey::FabricTokenServiceUrl => self.fabric_token_service_url.clone(), + AzureConfigKey::FabricWorkloadHost => self.fabric_workload_host.clone(), + AzureConfigKey::FabricSessionToken => self.fabric_session_token.clone(), + AzureConfigKey::FabricClusterIdentifier => self.fabric_cluster_identifier.clone(), } } @@ -856,6 +916,30 @@ impl MicrosoftAzureBuilder { let credential = if let Some(credential) = self.credentials { credential + } else if let ( + Some(fabric_token_service_url), + Some(fabric_workload_host), + Some(fabric_session_token), + Some(fabric_cluster_identifier), + ) = ( + &self.fabric_token_service_url, + &self.fabric_workload_host, + &self.fabric_session_token, + &self.fabric_cluster_identifier, + ) { + // This case should precede the bearer token case because it is more specific and will utilize the bearer token. 
+ let fabric_credential = FabricTokenOAuthProvider::new( + fabric_token_service_url, + fabric_workload_host, + fabric_session_token, + fabric_cluster_identifier, + self.bearer_token.clone(), + ); + Arc::new(TokenCredentialProvider::new( + fabric_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ } else if let Some(bearer_token) = self.bearer_token { static_creds(AzureCredential::BearerToken(bearer_token)) } else if let Some(access_key) = self.access_key { diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 7808c7c..6b5fa19 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -22,7 +22,7 @@ use crate::client::{CredentialProvider, TokenProvider}; use crate::util::hmac_sha256; use crate::RetryConfig; use async_trait::async_trait; -use base64::prelude::BASE64_STANDARD; +use base64::prelude::{BASE64_STANDARD, BASE64_URL_SAFE_NO_PAD}; use base64::Engine; use chrono::{DateTime, SecondsFormat, Utc}; use reqwest::header::{ @@ -51,10 +51,15 @@ pub(crate) static BLOB_TYPE: HeaderName = HeaderName::from_static("x-ms-blob-typ pub(crate) static DELETE_SNAPSHOTS: HeaderName = HeaderName::from_static("x-ms-delete-snapshots"); pub(crate) static COPY_SOURCE: HeaderName = HeaderName::from_static("x-ms-copy-source"); static CONTENT_MD5: HeaderName = HeaderName::from_static("content-md5"); +static PARTNER_TOKEN: HeaderName = HeaderName::from_static("x-ms-partner-token"); +static CLUSTER_IDENTIFIER: HeaderName = HeaderName::from_static("x-ms-cluster-identifier"); +static WORKLOAD_RESOURCE: HeaderName = HeaderName::from_static("x-ms-workload-resource-moniker"); +static PROXY_HOST: HeaderName = HeaderName::from_static("x-ms-proxy-host"); pub(crate) const RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; const CONTENT_TYPE_JSON: &str = "application/json"; const MSI_SECRET_ENV_KEY: &str = "IDENTITY_HEADER"; const MSI_API_VERSION: &str = "2019-08-01"; +const TOKEN_MIN_TTL: u64 = 300; /// OIDC scope used when interacting with OAuth2 APIs /// @@ -934,6 +939,113 @@ impl AzureCliCredential { } } +/// Encapsulates the logic to perform an OAuth token challenge for Fabric +#[derive(Debug)] +pub struct FabricTokenOAuthProvider { + fabric_token_service_url: String, + fabric_workload_host: String, + fabric_session_token: String, + fabric_cluster_identifier: String, + storage_access_token: Option, + token_expiry: Option, +} + +#[derive(Debug, Deserialize)] +struct Claims { + exp: u64, +} + +impl FabricTokenOAuthProvider { + /// Create a new [`FabricTokenOAuthProvider`] for an azure backed store + pub fn new( + fabric_token_service_url: impl Into, + fabric_workload_host: impl Into, + fabric_session_token: impl Into, + fabric_cluster_identifier: impl Into, + storage_access_token: Option, + ) -> Self { + let (storage_access_token, token_expiry) = match storage_access_token { + Some(token) => match Self::validate_and_get_expiry(&token) { + Some(expiry) if expiry > Self::get_current_timestamp() + TOKEN_MIN_TTL => { + (Some(token), Some(expiry)) + } + _ => (None, None), + }, + None => (None, None), + }; + + Self { + fabric_token_service_url: fabric_token_service_url.into(), + fabric_workload_host: fabric_workload_host.into(), + fabric_session_token: fabric_session_token.into(), + fabric_cluster_identifier: fabric_cluster_identifier.into(), + storage_access_token, + token_expiry, + } + } + + fn validate_and_get_expiry(token: &str) -> Option { + let payload = token.split('.').nth(1)?; + let decoded_bytes = BASE64_URL_SAFE_NO_PAD.decode(payload).ok()?; + let decoded_str = 
str::from_utf8(&decoded_bytes).ok()?; + let claims: Claims = serde_json::from_str(decoded_str).ok()?; + Some(claims.exp) + } + + fn get_current_timestamp() -> u64 { + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_or(0, |d| d.as_secs()) + } +} + +#[async_trait::async_trait] +impl TokenProvider for FabricTokenOAuthProvider { + type Credential = AzureCredential; + + /// Fetch a token + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> crate::Result>> { + if let Some(storage_access_token) = &self.storage_access_token { + if let Some(expiry) = self.token_expiry { + let exp_in = expiry - Self::get_current_timestamp(); + if exp_in > TOKEN_MIN_TTL { + return Ok(TemporaryToken { + token: Arc::new(AzureCredential::BearerToken(storage_access_token.clone())), + expiry: Some(Instant::now() + Duration::from_secs(exp_in)), + }); + } + } + } + + let query_items = vec![("resource", AZURE_STORAGE_RESOURCE)]; + let access_token: String = client + .request(Method::GET, &self.fabric_token_service_url) + .header(&PARTNER_TOKEN, self.fabric_session_token.as_str()) + .header(&CLUSTER_IDENTIFIER, self.fabric_cluster_identifier.as_str()) + .header(&WORKLOAD_RESOURCE, self.fabric_cluster_identifier.as_str()) + .header(&PROXY_HOST, self.fabric_workload_host.as_str()) + .query(&query_items) + .retryable(retry) + .idempotent(true) + .send() + .await + .context(TokenRequestSnafu)? + .text() + .await + .context(TokenResponseBodySnafu)?; + let exp_in = Self::validate_and_get_expiry(&access_token) + .map_or(3600, |expiry| expiry - Self::get_current_timestamp()); + Ok(TemporaryToken { + token: Arc::new(AzureCredential::BearerToken(access_token)), + expiry: Some(Instant::now() + Duration::from_secs(exp_in)), + }) + } +} + #[async_trait] impl CredentialProvider for AzureCliCredential { type Credential = AzureCredential; From fcd9f70fa9466dd2d9c7a5b8204f3c96207d34eb Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 23 Sep 2024 19:14:06 +0200 Subject: [PATCH 343/397] feat: expose HTTP/2 max frame size in `object_store` (#6442) Especially when transferring large amounts of data over HTTP/2, this can massively reduce the overhead. --- src/client/mod.rs | 32 ++++++++++++++++++++++++++++++++ src/config.rs | 9 +++++++++ 2 files changed, 41 insertions(+) diff --git a/src/client/mod.rs b/src/client/mod.rs index 9a3b705..b9688bc 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -94,6 +94,8 @@ pub enum ClientConfigKey { Http2KeepAliveTimeout, /// Enable HTTP2 keep alive pings for idle connections Http2KeepAliveWhileIdle, + /// Sets the maximum frame size to use for HTTP2. 
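The Fabric provider above only needs the `exp` claim of the storage access token (a JWT) to decide whether the cached token can be reused or must be refreshed; the token is reused only while `exp - now` stays above `TOKEN_MIN_TTL`. A condensed sketch of that extraction, equivalent to `validate_and_get_expiry`, assuming the `base64`, `serde`, and `serde_json` crates already in use here:

```rust
use base64::prelude::BASE64_URL_SAFE_NO_PAD;
use base64::Engine;
use serde::Deserialize;

#[derive(Deserialize)]
struct Claims {
    /// Expiry as seconds since the Unix epoch
    exp: u64,
}

/// Read the `exp` claim from a JWT: take the second dot-separated segment,
/// base64url-decode it (no padding), and parse the JSON payload.
/// Returns `None` for malformed tokens instead of erroring.
fn token_expiry(token: &str) -> Option<u64> {
    let payload = token.split('.').nth(1)?;
    let decoded = BASE64_URL_SAFE_NO_PAD.decode(payload).ok()?;
    let claims: Claims = serde_json::from_slice(&decoded).ok()?;
    Some(claims.exp)
}
```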
+ Http2MaxFrameSize, /// Only use http2 connections Http2Only, /// The pool max idle timeout @@ -129,6 +131,7 @@ impl AsRef for ClientConfigKey { Self::Http2KeepAliveInterval => "http2_keep_alive_interval", Self::Http2KeepAliveTimeout => "http2_keep_alive_timeout", Self::Http2KeepAliveWhileIdle => "http2_keep_alive_while_idle", + Self::Http2MaxFrameSize => "http2_max_frame_size", Self::PoolIdleTimeout => "pool_idle_timeout", Self::PoolMaxIdlePerHost => "pool_max_idle_per_host", Self::ProxyUrl => "proxy_url", @@ -154,6 +157,7 @@ impl FromStr for ClientConfigKey { "http2_keep_alive_interval" => Ok(Self::Http2KeepAliveInterval), "http2_keep_alive_timeout" => Ok(Self::Http2KeepAliveTimeout), "http2_keep_alive_while_idle" => Ok(Self::Http2KeepAliveWhileIdle), + "http2_max_frame_size" => Ok(Self::Http2MaxFrameSize), "pool_idle_timeout" => Ok(Self::PoolIdleTimeout), "pool_max_idle_per_host" => Ok(Self::PoolMaxIdlePerHost), "proxy_url" => Ok(Self::ProxyUrl), @@ -238,6 +242,7 @@ pub struct ClientOptions { http2_keep_alive_interval: Option>, http2_keep_alive_timeout: Option>, http2_keep_alive_while_idle: ConfigValue, + http2_max_frame_size: Option>, http1_only: ConfigValue, http2_only: ConfigValue, } @@ -269,6 +274,7 @@ impl Default for ClientOptions { http2_keep_alive_interval: None, http2_keep_alive_timeout: None, http2_keep_alive_while_idle: Default::default(), + http2_max_frame_size: None, // HTTP2 is known to be significantly slower than HTTP1, so we default // to HTTP1 for now. // https://github.com/apache/arrow-rs/issues/5194 @@ -304,6 +310,9 @@ impl ClientOptions { ClientConfigKey::Http2KeepAliveWhileIdle => { self.http2_keep_alive_while_idle.parse(value) } + ClientConfigKey::Http2MaxFrameSize => { + self.http2_max_frame_size = Some(ConfigValue::Deferred(value.into())) + } ClientConfigKey::PoolIdleTimeout => { self.pool_idle_timeout = Some(ConfigValue::Deferred(value.into())) } @@ -338,6 +347,9 @@ impl ClientOptions { ClientConfigKey::Http2KeepAliveWhileIdle => { Some(self.http2_keep_alive_while_idle.to_string()) } + ClientConfigKey::Http2MaxFrameSize => { + self.http2_max_frame_size.as_ref().map(|v| v.to_string()) + } ClientConfigKey::Http2Only => Some(self.http2_only.to_string()), ClientConfigKey::PoolIdleTimeout => self.pool_idle_timeout.as_ref().map(fmt_duration), ClientConfigKey::PoolMaxIdlePerHost => { @@ -541,6 +553,14 @@ impl ClientOptions { self } + /// Sets the maximum frame size to use for HTTP2. + /// + /// Default is currently 16,384 but may change internally to optimize for common uses. + pub fn with_http2_max_frame_size(mut self, sz: u32) -> Self { + self.http2_max_frame_size = Some(ConfigValue::Parsed(sz)); + self + } + /// Get the mime type for the file in `path` to be uploaded /// /// Gets the file extension from `path`, and returns the @@ -635,6 +655,10 @@ impl ClientOptions { builder = builder.http2_keep_alive_while_idle(true) } + if let Some(sz) = &self.http2_max_frame_size { + builder = builder.http2_max_frame_size(Some(sz.get()?)) + } + if self.http1_only.get()? 
{ builder = builder.http1_only() } @@ -799,6 +823,7 @@ mod tests { let http2_keep_alive_interval = "90 seconds".to_string(); let http2_keep_alive_timeout = "91 seconds".to_string(); let http2_keep_alive_while_idle = "92 seconds".to_string(); + let http2_max_frame_size = "1337".to_string(); let pool_idle_timeout = "93 seconds".to_string(); let pool_max_idle_per_host = "94".to_string(); let proxy_url = "https://fake_proxy_url".to_string(); @@ -824,6 +849,7 @@ mod tests { "http2_keep_alive_while_idle", http2_keep_alive_while_idle.clone(), ), + ("http2_max_frame_size", http2_max_frame_size.clone()), ("pool_idle_timeout", pool_idle_timeout.clone()), ("pool_max_idle_per_host", pool_max_idle_per_host.clone()), ("proxy_url", proxy_url.clone()), @@ -891,6 +917,12 @@ mod tests { .unwrap(), http2_keep_alive_while_idle ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2MaxFrameSize) + .unwrap(), + http2_max_frame_size + ); assert_eq!( builder diff --git a/src/config.rs b/src/config.rs index 987e6e4..f715fa0 100644 --- a/src/config.rs +++ b/src/config.rs @@ -103,6 +103,15 @@ impl Parse for usize { } } +impl Parse for u32 { + fn parse(v: &str) -> Result { + Self::from_str(v).map_err(|_| Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as u32").into(), + }) + } +} + impl Parse for HeaderValue { fn parse(v: &str) -> Result { Self::from_str(v).map_err(|_| Error::Generic { From f253f1677f190b1dd630dc4afd4d3e1fd9eb46a1 Mon Sep 17 00:00:00 2001 From: Alexander Shtuchkin Date: Thu, 26 Sep 2024 04:42:37 -0400 Subject: [PATCH 344/397] Update Cargo.toml (#6459) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index a878c0c..3a90322 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,7 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } -tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } +tokio = { version = "1.29.0", features = ["sync", "macros", "rt", "time", "io-util"] } md-5 = { version = "0.10.6", default-features = false, optional = true } [target.'cfg(target_family="unix")'.dev-dependencies] From df44c333083d10d6fe33cc60fd4d4bfa2fe56c1d Mon Sep 17 00:00:00 2001 From: ByteBaker <42913098+ByteBaker@users.noreply.github.com> Date: Wed, 2 Oct 2024 01:33:16 +0530 Subject: [PATCH 345/397] chore: add docs, part of #37 (#6453) * chore: add docs, part of #37 - add pragma `#![warn(missing_docs)]` to the following - `arrow-flight` - `arrow-ipc` - `arrow-integration-test` - `arrow-integration-testing` - `object_store` - also document the caveat with using level 10 GZIP compression in parquet. See #6282. 
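A short usage sketch for the HTTP/2 frame-size option added above; the 1 MiB value and bucket name are illustrative only, and the same setting can also be supplied through the `http2_max_frame_size` string config key.

```rust
use object_store::aws::AmazonS3Builder;
use object_store::ClientOptions;

fn main() {
    // The client defaults to HTTP/1, so HTTP/2 must be enabled for the
    // frame size to take effect; 1 MiB here is purely illustrative.
    let options = ClientOptions::new()
        .with_http2_only()
        .with_http2_max_frame_size(1024 * 1024);

    // Hypothetical bucket name; any builder that accepts ClientOptions works.
    let _builder = AmazonS3Builder::new()
        .with_bucket_name("my-bucket")
        .with_client_options(options);
}
```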
* chore: resolve PR comments from #6453 --- src/aws/builder.rs | 1 - src/aws/client.rs | 1 - src/aws/resolve.rs | 1 - src/azure/builder.rs | 1 - src/azure/client.rs | 1 - src/client/get.rs | 1 - src/lib.rs | 46 ++++++++++++++++++++++++++++++++++++++++---- src/local.rs | 1 - src/memory.rs | 1 - src/path/mod.rs | 35 ++++++++++++++++++++++++++++----- 10 files changed, 72 insertions(+), 17 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 75acb73..c52c3f8 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -44,7 +44,6 @@ static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName, diff --git a/src/aws/client.rs b/src/aws/client.rs index 6fe4889..7034a37 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -65,7 +65,6 @@ const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] pub(crate) enum Error { #[snafu(display("Error performing DeleteObjects request: {}", source))] DeleteObjectsRequest { source: crate::client::retry::Error }, diff --git a/src/aws/resolve.rs b/src/aws/resolve.rs index 12c9f26..4c74893 100644 --- a/src/aws/resolve.rs +++ b/src/aws/resolve.rs @@ -21,7 +21,6 @@ use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum Error { #[snafu(display("Bucket '{}' not found", bucket))] BucketNotFound { bucket: String }, diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 35cedea..1c4589b 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -46,7 +46,6 @@ const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; /// A specialized `Error` for Azure builder-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum Error { #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] UnableToParseUrl { diff --git a/src/azure/client.rs b/src/azure/client.rs index 0499051..06d3fb5 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -60,7 +60,6 @@ static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] pub(crate) enum Error { #[snafu(display("Error performing get request {}: {}", path, source))] GetRequest { diff --git a/src/client/get.rs b/src/client/get.rs index 0fef578..ae6a8d9 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -96,7 +96,6 @@ impl ContentRange { /// A specialized `Error` for get-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum GetResultError { #[snafu(context(false))] Header { diff --git a/src/lib.rs b/src/lib.rs index 8820983..a0d83eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1224,78 +1224,116 @@ pub type Result = std::result::Result; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] #[non_exhaustive] pub enum Error { + /// A fallback error type when no variant matches #[snafu(display("Generic {} error: {}", store, source))] Generic { + /// The store this error originated from store: &'static str, + /// The wrapped error source: Box, }, + /// Error when the object is not found at given location #[snafu(display("Object at location {} not found: {}", path, source))] NotFound { + /// The path to file path: String, + /// The wrapped error source: Box, }, + /// Error for invalid path #[snafu( display("Encountered object with invalid path: {}", source), context(false) )] - InvalidPath { source: path::Error }, + InvalidPath { + /// The wrapped error + source: path::Error, + }, + /// Error when `tokio::spawn` failed #[snafu(display("Error joining spawned task: {}", source), context(false))] - JoinError { source: tokio::task::JoinError }, + JoinError { + /// The wrapped error + source: tokio::task::JoinError, + }, + /// Error when the attempted operation is not supported #[snafu(display("Operation not supported: {}", source))] NotSupported { + /// The wrapped error source: Box, }, + /// Error when the object already exists #[snafu(display("Object at location {} already exists: {}", path, source))] AlreadyExists { + /// The path to the path: String, + /// The wrapped error source: Box, }, + /// Error when the required conditions failed for the operation #[snafu(display("Request precondition failure for path {}: {}", path, source))] Precondition { + /// The path to the file path: String, + /// The wrapped error source: Box, }, + /// Error when the object at the location isn't modified #[snafu(display("Object at location {} not modified: {}", path, source))] NotModified { + /// The path to the file path: String, + /// The wrapped error source: Box, }, + /// Error when an operation is not implemented #[snafu(display("Operation not yet implemented."))] NotImplemented, + /// Error when the used credentials don't have enough permission + /// to perform the requested operation #[snafu(display( "The operation lacked the necessary privileges to complete for path {}: {}", path, source ))] PermissionDenied { + /// The path to the file path: String, + /// The wrapped error source: Box, }, + /// Error when the used credentials lack valid authentication #[snafu(display( "The operation lacked valid authentication credentials for path {}: {}", path, source ))] Unauthenticated { + /// The path to the file path: String, + /// The 
wrapped error source: Box, }, + /// Error when a configuration key is invalid for the store used #[snafu(display("Configuration key: '{}' is not valid for store '{}'.", key, store))] - UnknownConfigurationKey { store: &'static str, key: String }, + UnknownConfigurationKey { + /// The object store used + store: &'static str, + /// The configuration key used + key: String, + }, } impl From for std::io::Error { diff --git a/src/local.rs b/src/local.rs index db4b4b0..ac10f33 100644 --- a/src/local.rs +++ b/src/local.rs @@ -44,7 +44,6 @@ use crate::{ /// A specialized `Error` for filesystem object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] pub(crate) enum Error { #[snafu(display("File size for {} did not fit in a usize: {}", path, source))] FileSizeOverflowedUsize { diff --git a/src/memory.rs b/src/memory.rs index 0d72983..b458bdd 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -38,7 +38,6 @@ use crate::{GetOptions, PutPayload}; /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] -#[allow(missing_docs)] enum Error { #[snafu(display("No data in memory found. Location: {path}"))] NoDataInMemory { path: String }, diff --git a/src/path/mod.rs b/src/path/mod.rs index 59e08e2..4c9bb5f 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -36,32 +36,57 @@ pub use parts::{InvalidPart, PathPart}; /// Error returned by [`Path::parse`] #[derive(Debug, Snafu)] -#[allow(missing_docs)] #[non_exhaustive] pub enum Error { + /// Error when there's an empty segment between two slashes `/` in the path #[snafu(display("Path \"{}\" contained empty path segment", path))] - EmptySegment { path: String }, + EmptySegment { + /// The source path + path: String, + }, + /// Error when an invalid segment is encountered in the given path #[snafu(display("Error parsing Path \"{}\": {}", path, source))] - BadSegment { path: String, source: InvalidPart }, + BadSegment { + /// The source path + path: String, + /// The part containing the error + source: InvalidPart, + }, + /// Error when path cannot be canonicalized #[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))] Canonicalize { + /// The source path path: std::path::PathBuf, + /// The underlying error source: std::io::Error, }, + /// Error when the path is not a valid URL #[snafu(display("Unable to convert path \"{}\" to URL", path.display()))] - InvalidPath { path: std::path::PathBuf }, + InvalidPath { + /// The source path + path: std::path::PathBuf, + }, + /// Error when a path contains non-unicode characters #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] NonUnicode { + /// The source path path: String, + /// The underlying `UTF8Error` source: std::str::Utf8Error, }, + /// Error when the a path doesn't start with given prefix #[snafu(display("Path {} does not start with prefix {}", path, prefix))] - PrefixMismatch { path: String, prefix: String }, + PrefixMismatch { + /// The source path + path: String, + /// The mismatched prefix + prefix: String, + }, } /// A parsed path representation that can be safely written to object storage From 101a097c0852eb885fa6584450f47dd07c512b16 Mon Sep 17 00:00:00 2001 From: ByteBaker <42913098+ByteBaker@users.noreply.github.com> Date: Sun, 6 Oct 2024 16:53:09 +0530 Subject: [PATCH 346/397] object_store: enable lint `unreachable_pub` (#6512) - remove warnings by changing unreachable `pub` to `pub(crate)` --- src/aws/client.rs | 40 ++++++++++++++++----------------- src/aws/credential.rs | 10 ++++----- 
src/aws/dynamo.rs | 6 ++--- src/azure/client.rs | 22 +++++++++--------- src/azure/credential.rs | 28 +++++++++++------------ src/client/backoff.rs | 8 +++---- src/client/get.rs | 4 ++-- src/client/header.rs | 15 ++++++++----- src/client/list.rs | 4 ++-- src/client/mock_server.rs | 16 ++++++------- src/client/mod.rs | 32 +++++++++++++------------- src/client/pagination.rs | 2 +- src/client/retry.rs | 14 ++++++------ src/client/s3.rs | 8 +++---- src/client/token.rs | 8 +++---- src/config.rs | 8 +++---- src/gcp/client.rs | 47 +++++++++++++++++++++++++-------------- src/gcp/credential.rs | 42 +++++++++++++++++----------------- src/http/client.rs | 36 +++++++++++++++++------------- src/lib.rs | 7 +++--- src/local.rs | 2 +- src/util.rs | 8 +++---- 22 files changed, 194 insertions(+), 173 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 7034a37..cc74f2d 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -165,7 +165,7 @@ impl From for Error { } #[derive(Debug)] -pub struct S3Config { +pub(crate) struct S3Config { pub region: String, pub endpoint: Option, pub bucket: String, @@ -269,12 +269,12 @@ pub(crate) struct Request<'a> { } impl<'a> Request<'a> { - pub fn query(self, query: &T) -> Self { + pub(crate) fn query(self, query: &T) -> Self { let builder = self.builder.query(query); Self { builder, ..self } } - pub fn header(self, k: K, v: &str) -> Self + pub(crate) fn header(self, k: K, v: &str) -> Self where HeaderName: TryFrom, >::Error: Into, @@ -283,29 +283,29 @@ impl<'a> Request<'a> { Self { builder, ..self } } - pub fn headers(self, headers: HeaderMap) -> Self { + pub(crate) fn headers(self, headers: HeaderMap) -> Self { let builder = self.builder.headers(headers); Self { builder, ..self } } - pub fn idempotent(self, idempotent: bool) -> Self { + pub(crate) fn idempotent(self, idempotent: bool) -> Self { Self { idempotent, ..self } } - pub fn with_encryption_headers(self) -> Self { + pub(crate) fn with_encryption_headers(self) -> Self { let headers = self.config.encryption_headers.clone().into(); let builder = self.builder.headers(headers); Self { builder, ..self } } - pub fn with_session_creds(self, use_session_creds: bool) -> Self { + pub(crate) fn with_session_creds(self, use_session_creds: bool) -> Self { Self { use_session_creds, ..self } } - pub fn with_tags(mut self, tags: TagSet) -> Self { + pub(crate) fn with_tags(mut self, tags: TagSet) -> Self { let tags = tags.encoded(); if !tags.is_empty() && !self.config.disable_tagging { self.builder = self.builder.header(&TAGS_HEADER, tags); @@ -313,7 +313,7 @@ impl<'a> Request<'a> { self } - pub fn with_attributes(self, attributes: Attributes) -> Self { + pub(crate) fn with_attributes(self, attributes: Attributes) -> Self { let mut has_content_type = false; let mut builder = self.builder; for (k, v) in &attributes { @@ -341,7 +341,7 @@ impl<'a> Request<'a> { Self { builder, ..self } } - pub fn with_payload(mut self, payload: PutPayload) -> Self { + pub(crate) fn with_payload(mut self, payload: PutPayload) -> Self { if !self.config.skip_signature || self.config.checksum.is_some() { let mut sha256 = Context::new(&digest::SHA256); payload.iter().for_each(|x| sha256.update(x)); @@ -362,7 +362,7 @@ impl<'a> Request<'a> { self } - pub async fn send(self) -> Result { + pub(crate) async fn send(self) -> Result { let credential = match self.use_session_creds { true => self.config.get_session_credential().await?, false => SessionCredential { @@ -385,7 +385,7 @@ impl<'a> Request<'a> { .context(RetrySnafu { path }) } - pub 
async fn do_put(self) -> Result { + pub(crate) async fn do_put(self) -> Result { let response = self.send().await?; Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) } @@ -398,12 +398,12 @@ pub(crate) struct S3Client { } impl S3Client { - pub fn new(config: S3Config) -> Result { + pub(crate) fn new(config: S3Config) -> Result { let client = config.client_options.client()?; Ok(Self { config, client }) } - pub fn request<'a>(&'a self, method: Method, path: &'a Path) -> Request<'a> { + pub(crate) fn request<'a>(&'a self, method: Method, path: &'a Path) -> Request<'a> { let url = self.config.path_url(path); Request { path, @@ -423,7 +423,7 @@ impl S3Client { /// there was an error for a certain path, the error will be returned in the /// vector. If there was an issue with making the overall request, an error /// will be returned at the top level. - pub async fn bulk_delete_request(&self, paths: Vec) -> Result>> { + pub(crate) async fn bulk_delete_request(&self, paths: Vec) -> Result>> { if paths.is_empty() { return Ok(Vec::new()); } @@ -519,7 +519,7 @@ impl S3Client { } /// Make an S3 Copy request - pub fn copy_request<'a>(&'a self, from: &Path, to: &'a Path) -> Request<'a> { + pub(crate) fn copy_request<'a>(&'a self, from: &Path, to: &'a Path) -> Request<'a> { let source = format!("{}/{}", self.config.bucket, encode_path(from)); let mut copy_source_encryption_headers = HeaderMap::new(); @@ -565,7 +565,7 @@ impl S3Client { .with_session_creds(false) } - pub async fn create_multipart( + pub(crate) async fn create_multipart( &self, location: &Path, opts: PutMultipartOpts, @@ -589,7 +589,7 @@ impl S3Client { Ok(response.upload_id) } - pub async fn put_part( + pub(crate) async fn put_part( &self, path: &Path, upload_id: &MultipartId, @@ -618,7 +618,7 @@ impl S3Client { Ok(PartId { content_id }) } - pub async fn complete_multipart( + pub(crate) async fn complete_multipart( &self, location: &Path, upload_id: &str, @@ -669,7 +669,7 @@ impl S3Client { } #[cfg(test)] - pub async fn get_object_tagging(&self, path: &Path) -> Result { + pub(crate) async fn get_object_tagging(&self, path: &Path) -> Result { let credential = self.config.get_session_credential().await?; let url = format!("{}?tagging", self.config.path_url(path)); let response = self diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 63cb571..33972c6 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -314,7 +314,7 @@ impl<'a> AwsAuthorizer<'a> { } } -pub trait CredentialExt { +pub(crate) trait CredentialExt { /// Sign a request fn with_aws_sigv4( self, @@ -423,7 +423,7 @@ fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { /// /// #[derive(Debug)] -pub struct InstanceCredentialProvider { +pub(crate) struct InstanceCredentialProvider { pub imdsv1_fallback: bool, pub metadata_endpoint: String, } @@ -450,7 +450,7 @@ impl TokenProvider for InstanceCredentialProvider { /// /// #[derive(Debug)] -pub struct WebIdentityProvider { +pub(crate) struct WebIdentityProvider { pub token_path: String, pub role_arn: String, pub session_name: String, @@ -633,7 +633,7 @@ async fn web_identity( /// /// #[derive(Debug)] -pub struct TaskCredentialProvider { +pub(crate) struct TaskCredentialProvider { pub url: String, pub retry: RetryConfig, pub client: Client, @@ -675,7 +675,7 @@ async fn task_credential( /// /// #[derive(Debug)] -pub struct SessionProvider { +pub(crate) struct SessionProvider { pub endpoint: String, pub region: String, pub credentials: AwsCredentialProvider, diff --git 
a/src/aws/dynamo.rs b/src/aws/dynamo.rs index 9de67e5..ece3b8a 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -508,11 +508,11 @@ impl<'a> From<&'a str> for AttributeValue<'a> { mod number { use serde::{Deserialize, Deserializer, Serializer}; - pub fn serialize(v: &u64, s: S) -> Result { + pub(crate) fn serialize(v: &u64, s: S) -> Result { s.serialize_str(&v.to_string()) } - pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result { + pub(crate) fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result { let v: &str = Deserialize::deserialize(d)?; v.parse().map_err(serde::de::Error::custom) } @@ -541,7 +541,7 @@ mod tests { /// An integration test for DynamoDB /// /// This is a function called by s3_test to avoid test concurrency issues - pub async fn integration_test(integration: &AmazonS3, d: &DynamoCommit) { + pub(crate) async fn integration_test(integration: &AmazonS3, d: &DynamoCommit) { let client = integration.client.as_ref(); let src = Path::from("dynamo_path_src"); diff --git a/src/azure/client.rs b/src/azure/client.rs index 06d3fb5..e78f8db 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -255,13 +255,13 @@ pub(crate) struct AzureClient { impl AzureClient { /// create a new instance of [AzureClient] - pub fn new(config: AzureConfig) -> Result { + pub(crate) fn new(config: AzureConfig) -> Result { let client = config.client_options.client()?; Ok(Self { config, client }) } /// Returns the config - pub fn config(&self) -> &AzureConfig { + pub(crate) fn config(&self) -> &AzureConfig { &self.config } @@ -283,7 +283,7 @@ impl AzureClient { } /// Make an Azure PUT request - pub async fn put_blob( + pub(crate) async fn put_blob( &self, path: &Path, payload: PutPayload, @@ -308,7 +308,7 @@ impl AzureClient { } /// PUT a block - pub async fn put_block( + pub(crate) async fn put_block( &self, path: &Path, part_idx: usize, @@ -327,7 +327,7 @@ impl AzureClient { } /// PUT a block list - pub async fn put_block_list( + pub(crate) async fn put_block_list( &self, path: &Path, parts: Vec, @@ -352,7 +352,7 @@ impl AzureClient { } /// Make an Azure Delete request - pub async fn delete_request( + pub(crate) async fn delete_request( &self, path: &Path, query: &T, @@ -381,7 +381,7 @@ impl AzureClient { } /// Make an Azure Copy request - pub async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { + pub(crate) async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let credential = self.get_credential().await?; let url = self.config.path_url(to); let mut source = self.config.path_url(from); @@ -468,7 +468,7 @@ impl AzureClient { /// /// Depending on the type of credential, this will either use the account key or a user delegation key. /// Since delegation keys are acquired ad-hoc, the signer aloows for signing multiple urls with the same key. 
- pub async fn signer(&self, expires_in: Duration) -> Result { + pub(crate) async fn signer(&self, expires_in: Duration) -> Result { let credential = self.get_credential().await?; let signed_start = chrono::Utc::now(); let signed_expiry = signed_start + expires_in; @@ -499,7 +499,7 @@ impl AzureClient { } #[cfg(test)] - pub async fn get_blob_tagging(&self, path: &Path) -> Result { + pub(crate) async fn get_blob_tagging(&self, path: &Path) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); let sensitive = credential @@ -757,7 +757,7 @@ struct BlobProperties { pub(crate) struct BlockId(Bytes); impl BlockId { - pub fn new(block_id: impl Into) -> Self { + pub(crate) fn new(block_id: impl Into) -> Self { Self(block_id.into()) } } @@ -783,7 +783,7 @@ pub(crate) struct BlockList { } impl BlockList { - pub fn to_xml(&self) -> String { + pub(crate) fn to_xml(&self) -> String { let mut s = String::new(); s.push_str("\n\n"); for block_id in &self.blocks { diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 6b5fa19..2832eed 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -95,7 +95,7 @@ pub enum Error { SASforSASNotSupported, } -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; impl From for crate::Error { fn from(value: Error) -> Self { @@ -168,7 +168,7 @@ pub(crate) struct AzureSigner { } impl AzureSigner { - pub fn new( + pub(crate) fn new( signing_key: AzureAccessKey, account: String, start: DateTime, @@ -184,7 +184,7 @@ impl AzureSigner { } } - pub fn sign(&self, method: &Method, url: &mut Url) -> Result<()> { + pub(crate) fn sign(&self, method: &Method, url: &mut Url) -> Result<()> { let (str_to_sign, query_pairs) = match &self.delegation_key { Some(delegation_key) => string_to_sign_user_delegation_sas( url, @@ -584,7 +584,7 @@ struct OAuthTokenResponse { /// /// #[derive(Debug)] -pub struct ClientSecretOAuthProvider { +pub(crate) struct ClientSecretOAuthProvider { token_url: String, client_id: String, client_secret: String, @@ -592,7 +592,7 @@ pub struct ClientSecretOAuthProvider { impl ClientSecretOAuthProvider { /// Create a new [`ClientSecretOAuthProvider`] for an azure backed store - pub fn new( + pub(crate) fn new( client_id: String, client_secret: String, tenant_id: impl AsRef, @@ -676,7 +676,7 @@ struct ImdsTokenResponse { /// This authentication type works in Azure VMs, App Service and Azure Functions applications, as well as the Azure Cloud Shell /// #[derive(Debug)] -pub struct ImdsManagedIdentityProvider { +pub(crate) struct ImdsManagedIdentityProvider { msi_endpoint: String, client_id: Option, object_id: Option, @@ -685,7 +685,7 @@ pub struct ImdsManagedIdentityProvider { impl ImdsManagedIdentityProvider { /// Create a new [`ImdsManagedIdentityProvider`] for an azure backed store - pub fn new( + pub(crate) fn new( client_id: Option, object_id: Option, msi_res_id: Option, @@ -760,7 +760,7 @@ impl TokenProvider for ImdsManagedIdentityProvider { /// /// #[derive(Debug)] -pub struct WorkloadIdentityOAuthProvider { +pub(crate) struct WorkloadIdentityOAuthProvider { token_url: String, client_id: String, federated_token_file: String, @@ -768,7 +768,7 @@ pub struct WorkloadIdentityOAuthProvider { impl WorkloadIdentityOAuthProvider { /// Create a new [`WorkloadIdentityOAuthProvider`] for an azure backed store - pub fn new( + pub(crate) fn new( client_id: impl Into, federated_token_file: impl Into, tenant_id: impl AsRef, @@ -836,7 +836,7 @@ mod az_cli_date_format { use 
chrono::{DateTime, TimeZone}; use serde::{self, Deserialize, Deserializer}; - pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { @@ -863,12 +863,12 @@ struct AzureCliTokenResponse { } #[derive(Default, Debug)] -pub struct AzureCliCredential { +pub(crate) struct AzureCliCredential { cache: TokenCache>, } impl AzureCliCredential { - pub fn new() -> Self { + pub(crate) fn new() -> Self { Self::default() } @@ -941,7 +941,7 @@ impl AzureCliCredential { /// Encapsulates the logic to perform an OAuth token challenge for Fabric #[derive(Debug)] -pub struct FabricTokenOAuthProvider { +pub(crate) struct FabricTokenOAuthProvider { fabric_token_service_url: String, fabric_workload_host: String, fabric_session_token: String, @@ -957,7 +957,7 @@ struct Claims { impl FabricTokenOAuthProvider { /// Create a new [`FabricTokenOAuthProvider`] for an azure backed store - pub fn new( + pub(crate) fn new( fabric_token_service_url: impl Into, fabric_workload_host: impl Into, fabric_session_token: impl Into, diff --git a/src/client/backoff.rs b/src/client/backoff.rs index a1fa26c..8382a2e 100644 --- a/src/client/backoff.rs +++ b/src/client/backoff.rs @@ -51,7 +51,7 @@ impl Default for BackoffConfig { /// /// Consecutive calls to [`Backoff::next`] will return the next backoff interval /// -pub struct Backoff { +pub(crate) struct Backoff { init_backoff: f64, next_backoff_secs: f64, max_backoff_secs: f64, @@ -72,14 +72,14 @@ impl std::fmt::Debug for Backoff { impl Backoff { /// Create a new [`Backoff`] from the provided [`BackoffConfig`] - pub fn new(config: &BackoffConfig) -> Self { + pub(crate) fn new(config: &BackoffConfig) -> Self { Self::new_with_rng(config, None) } /// Creates a new `Backoff` with the optional `rng` /// /// Used [`rand::thread_rng()`] if no rng provided - pub fn new_with_rng( + pub(crate) fn new_with_rng( config: &BackoffConfig, rng: Option>, ) -> Self { @@ -94,7 +94,7 @@ impl Backoff { } /// Returns the next backoff duration to wait for - pub fn next(&mut self) -> Duration { + pub(crate) fn next(&mut self) -> Duration { let range = self.init_backoff..(self.next_backoff_secs * self.base); let rand_backoff = match self.rng.as_mut() { diff --git a/src/client/get.rs b/src/client/get.rs index ae6a8d9..5dd62cb 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -33,7 +33,7 @@ use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A client that can perform a get request #[async_trait] -pub trait GetClient: Send + Sync + 'static { +pub(crate) trait GetClient: Send + Sync + 'static { const STORE: &'static str; /// Configure the [`HeaderConfig`] for this client @@ -44,7 +44,7 @@ pub trait GetClient: Send + Sync + 'static { /// Extension trait for [`GetClient`] that adds common retrieval functionality #[async_trait] -pub trait GetClientExt { +pub(crate) trait GetClientExt { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; } diff --git a/src/client/header.rs b/src/client/header.rs index 9ce5db4..07c04c1 100644 --- a/src/client/header.rs +++ b/src/client/header.rs @@ -26,7 +26,7 @@ use snafu::{OptionExt, ResultExt, Snafu}; #[derive(Debug, Copy, Clone)] /// Configuration for header extraction -pub struct HeaderConfig { +pub(crate) struct HeaderConfig { /// Whether to require an ETag header when extracting [`ObjectMeta`] from headers. 
/// /// Defaults to `true` @@ -45,7 +45,7 @@ pub struct HeaderConfig { } #[derive(Debug, Snafu)] -pub enum Error { +pub(crate) enum Error { #[snafu(display("ETag Header missing from response"))] MissingEtag, @@ -73,7 +73,10 @@ pub enum Error { /// Extracts a PutResult from the provided [`HeaderMap`] #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] -pub fn get_put_result(headers: &HeaderMap, version: &str) -> Result { +pub(crate) fn get_put_result( + headers: &HeaderMap, + version: &str, +) -> Result { let e_tag = Some(get_etag(headers)?); let version = get_version(headers, version)?; Ok(crate::PutResult { e_tag, version }) @@ -81,7 +84,7 @@ pub fn get_put_result(headers: &HeaderMap, version: &str) -> Result Result, Error> { +pub(crate) fn get_version(headers: &HeaderMap, version: &str) -> Result, Error> { Ok(match headers.get(version) { Some(x) => Some(x.to_str().context(BadHeaderSnafu)?.to_string()), None => None, @@ -89,13 +92,13 @@ pub fn get_version(headers: &HeaderMap, version: &str) -> Result, } /// Extracts an etag from the provided [`HeaderMap`] -pub fn get_etag(headers: &HeaderMap) -> Result { +pub(crate) fn get_etag(headers: &HeaderMap) -> Result { let e_tag = headers.get(ETAG).ok_or(Error::MissingEtag)?; Ok(e_tag.to_str().context(BadHeaderSnafu)?.to_string()) } /// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] -pub fn header_meta( +pub(crate) fn header_meta( location: &Path, headers: &HeaderMap, cfg: HeaderConfig, diff --git a/src/client/list.rs b/src/client/list.rs index 2dbe20f..4445d0d 100644 --- a/src/client/list.rs +++ b/src/client/list.rs @@ -26,7 +26,7 @@ use std::collections::BTreeSet; /// A client that can perform paginated list requests #[async_trait] -pub trait ListClient: Send + Sync + 'static { +pub(crate) trait ListClient: Send + Sync + 'static { async fn list_request( &self, prefix: Option<&str>, @@ -38,7 +38,7 @@ pub trait ListClient: Send + Sync + 'static { /// Extension trait for [`ListClient`] that adds common listing functionality #[async_trait] -pub trait ListClientExt { +pub(crate) trait ListClientExt { fn list_paginated( &self, prefix: Option<&Path>, diff --git a/src/client/mock_server.rs b/src/client/mock_server.rs index aa5a9e0..8be4a72 100644 --- a/src/client/mock_server.rs +++ b/src/client/mock_server.rs @@ -32,11 +32,11 @@ use tokio::net::TcpListener; use tokio::sync::oneshot; use tokio::task::{JoinHandle, JoinSet}; -pub type ResponseFn = +pub(crate) type ResponseFn = Box) -> BoxFuture<'static, Response> + Send>; /// A mock server -pub struct MockServer { +pub(crate) struct MockServer { responses: Arc>>, shutdown: oneshot::Sender<()>, handle: JoinHandle<()>, @@ -44,7 +44,7 @@ pub struct MockServer { } impl MockServer { - pub async fn new() -> Self { + pub(crate) async fn new() -> Self { let responses: Arc>> = Arc::new(Mutex::new(VecDeque::with_capacity(10))); @@ -97,17 +97,17 @@ impl MockServer { } /// The url of the mock server - pub fn url(&self) -> &str { + pub(crate) fn url(&self) -> &str { &self.url } /// Add a response - pub fn push(&self, response: Response) { + pub(crate) fn push(&self, response: Response) { self.push_fn(|_| response) } /// Add a response function - pub fn push_fn(&self, f: F) + pub(crate) fn push_fn(&self, f: F) where F: FnOnce(Request) -> Response + Send + 'static, { @@ -115,7 +115,7 @@ impl MockServer { self.responses.lock().push_back(f) } - pub fn push_async_fn(&self, f: F) + pub(crate) fn push_async_fn(&self, f: F) where F: FnOnce(Request) -> Fut + Send + 'static, Fut: Future> + Send + 'static, 
@@ -124,7 +124,7 @@ impl MockServer { } /// Shutdown the mock server - pub async fn shutdown(self) { + pub(crate) async fn shutdown(self) { let _ = self.shutdown.send(()); self.handle.await.unwrap() } diff --git a/src/client/mod.rs b/src/client/mod.rs index b9688bc..7b1b469 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -17,31 +17,31 @@ //! Generic utilities reqwest based ObjectStore implementations -pub mod backoff; +pub(crate) mod backoff; #[cfg(test)] -pub mod mock_server; +pub(crate) mod mock_server; -pub mod retry; +pub(crate) mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] -pub mod pagination; +pub(crate) mod pagination; -pub mod get; +pub(crate) mod get; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] -pub mod list; +pub(crate) mod list; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] -pub mod token; +pub(crate) mod token; -pub mod header; +pub(crate) mod header; #[cfg(any(feature = "aws", feature = "gcp"))] -pub mod s3; +pub(crate) mod s3; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] -pub mod parts; +pub(crate) mod parts; use async_trait::async_trait; use std::collections::HashMap; @@ -678,7 +678,7 @@ impl ClientOptions { } } -pub trait GetOptionsExt { +pub(crate) trait GetOptionsExt { fn with_get_options(self, options: GetOptions) -> Self; } @@ -756,7 +756,7 @@ mod cloud { /// A [`CredentialProvider`] that uses [`Client`] to fetch temporary tokens #[derive(Debug)] - pub struct TokenCredentialProvider { + pub(crate) struct TokenCredentialProvider { inner: T, client: Client, retry: RetryConfig, @@ -764,7 +764,7 @@ mod cloud { } impl TokenCredentialProvider { - pub fn new(inner: T, client: Client, retry: RetryConfig) -> Self { + pub(crate) fn new(inner: T, client: Client, retry: RetryConfig) -> Self { Self { inner, client, @@ -775,7 +775,7 @@ mod cloud { /// Override the minimum remaining TTL for a cached token to be used #[cfg(feature = "aws")] - pub fn with_min_ttl(mut self, min_ttl: Duration) -> Self { + pub(crate) fn with_min_ttl(mut self, min_ttl: Duration) -> Self { self.cache = self.cache.with_min_ttl(min_ttl); self } @@ -793,7 +793,7 @@ mod cloud { } #[async_trait] - pub trait TokenProvider: std::fmt::Debug + Send + Sync { + pub(crate) trait TokenProvider: std::fmt::Debug + Send + Sync { type Credential: std::fmt::Debug + Send + Sync; async fn fetch_token( @@ -805,7 +805,7 @@ mod cloud { } #[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] -pub use cloud::*; +pub(crate) use cloud::*; #[cfg(test)] mod tests { diff --git a/src/client/pagination.rs b/src/client/pagination.rs index 1febe3a..77b2a3d 100644 --- a/src/client/pagination.rs +++ b/src/client/pagination.rs @@ -35,7 +35,7 @@ use std::future::Future; /// finish, otherwise it will continue to call `op(state, token)` with the values returned by the /// previous call to `op`, until a continuation token of `None` is returned /// -pub fn stream_paginated(state: S, op: F) -> impl Stream> +pub(crate) fn stream_paginated(state: S, op: F) -> impl Stream> where F: Fn(S, Option) -> Fut + Copy, Fut: Future)>>, diff --git a/src/client/retry.rs b/src/client/retry.rs index 1fc689c..2f2ba0a 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -129,7 +129,7 @@ impl From for std::io::Error { } } -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// The configuration for how to respond to request errors /// @@ -178,7 +178,7 @@ impl Default for RetryConfig { } } -pub struct RetryableRequest 
{ +pub(crate) struct RetryableRequest { client: Client, request: Request, @@ -196,7 +196,7 @@ impl RetryableRequest { /// /// An idempotent request will be retried on timeout even if the request /// method is not [safe](https://datatracker.ietf.org/doc/html/rfc7231#section-4.2.1) - pub fn idempotent(self, idempotent: bool) -> Self { + pub(crate) fn idempotent(self, idempotent: bool) -> Self { Self { idempotent: Some(idempotent), ..self @@ -207,16 +207,16 @@ impl RetryableRequest { /// /// This will avoid printing out the URL in error messages #[allow(unused)] - pub fn sensitive(self, sensitive: bool) -> Self { + pub(crate) fn sensitive(self, sensitive: bool) -> Self { Self { sensitive, ..self } } /// Provide a [`PutPayload`] - pub fn payload(self, payload: Option) -> Self { + pub(crate) fn payload(self, payload: Option) -> Self { Self { payload, ..self } } - pub async fn send(self) -> Result { + pub(crate) async fn send(self) -> Result { let max_retries = self.max_retries; let retry_timeout = self.retry_timeout; let mut retries = 0; @@ -369,7 +369,7 @@ impl RetryableRequest { } } -pub trait RetryExt { +pub(crate) trait RetryExt { /// Return a [`RetryableRequest`] fn retryable(self, config: &RetryConfig) -> RetryableRequest; diff --git a/src/client/s3.rs b/src/client/s3.rs index 61237dc..a9c4726 100644 --- a/src/client/s3.rs +++ b/src/client/s3.rs @@ -88,13 +88,13 @@ impl TryFrom for ObjectMeta { #[derive(Debug, Deserialize)] #[serde(rename_all = "PascalCase")] -pub struct InitiateMultipartUploadResult { +pub(crate) struct InitiateMultipartUploadResult { pub upload_id: String, } #[derive(Debug, Serialize)] #[serde(rename_all = "PascalCase")] -pub struct CompleteMultipartUpload { +pub(crate) struct CompleteMultipartUpload { pub part: Vec, } @@ -113,7 +113,7 @@ impl From> for CompleteMultipartUpload { } #[derive(Debug, Serialize)] -pub struct MultipartPart { +pub(crate) struct MultipartPart { #[serde(rename = "ETag")] pub e_tag: String, #[serde(rename = "PartNumber")] @@ -122,7 +122,7 @@ pub struct MultipartPart { #[derive(Debug, Deserialize)] #[serde(rename_all = "PascalCase")] -pub struct CompleteMultipartUploadResult { +pub(crate) struct CompleteMultipartUploadResult { #[serde(rename = "ETag")] pub e_tag: String, } diff --git a/src/client/token.rs b/src/client/token.rs index 7a3c807..f729419 100644 --- a/src/client/token.rs +++ b/src/client/token.rs @@ -21,7 +21,7 @@ use tokio::sync::Mutex; /// A temporary authentication token with an associated expiry #[derive(Debug, Clone)] -pub struct TemporaryToken { +pub(crate) struct TemporaryToken { /// The temporary credential pub token: T, /// The instant at which this credential is no longer valid @@ -32,7 +32,7 @@ pub struct TemporaryToken { /// Provides [`TokenCache::get_or_insert_with`] which can be used to cache a /// [`TemporaryToken`] based on its expiry #[derive(Debug)] -pub struct TokenCache { +pub(crate) struct TokenCache { cache: Mutex>>, min_ttl: Duration, } @@ -49,11 +49,11 @@ impl Default for TokenCache { impl TokenCache { /// Override the minimum remaining TTL for a cached token to be used #[cfg(feature = "aws")] - pub fn with_min_ttl(self, min_ttl: Duration) -> Self { + pub(crate) fn with_min_ttl(self, min_ttl: Duration) -> Self { Self { min_ttl, ..self } } - pub async fn get_or_insert_with(&self, f: F) -> Result + pub(crate) async fn get_or_insert_with(&self, f: F) -> Result where F: FnOnce() -> Fut + Send, Fut: Future, E>> + Send, diff --git a/src/config.rs b/src/config.rs index f715fa0..29a389d 100644 --- a/src/config.rs +++ 
b/src/config.rs @@ -27,7 +27,7 @@ use crate::{Error, Result}; /// /// This allows builders to defer fallibility to build #[derive(Debug, Clone)] -pub enum ConfigValue { +pub(crate) enum ConfigValue { Parsed(T), Deferred(String), } @@ -48,11 +48,11 @@ impl From for ConfigValue { } impl ConfigValue { - pub fn parse(&mut self, v: impl Into) { + pub(crate) fn parse(&mut self, v: impl Into) { *self = Self::Deferred(v.into()) } - pub fn get(&self) -> Result { + pub(crate) fn get(&self) -> Result { match self { Self::Parsed(v) => Ok(v.clone()), Self::Deferred(v) => T::parse(v), @@ -67,7 +67,7 @@ impl Default for ConfigValue { } /// A value that can be stored in [`ConfigValue`] -pub trait Parse: Sized { +pub(crate) trait Parse: Sized { fn parse(v: &str) -> Result; } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 0045383..a259f6b 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -124,7 +124,7 @@ impl From for crate::Error { } #[derive(Debug)] -pub struct GoogleCloudStorageConfig { +pub(crate) struct GoogleCloudStorageConfig { pub base_url: String, pub credentials: GcpCredentialProvider, @@ -139,7 +139,7 @@ pub struct GoogleCloudStorageConfig { } impl GoogleCloudStorageConfig { - pub fn new( + pub(crate) fn new( base_url: String, credentials: GcpCredentialProvider, signing_credentials: GcpSigningCredentialProvider, @@ -157,13 +157,13 @@ impl GoogleCloudStorageConfig { } } - pub fn path_url(&self, path: &Path) -> String { + pub(crate) fn path_url(&self, path: &Path) -> String { format!("{}/{}/{}", self.base_url, self.bucket_name, path) } } /// A builder for a put request allowing customisation of the headers and query string -pub struct Request<'a> { +pub(crate) struct Request<'a> { path: &'a Path, config: &'a GoogleCloudStorageConfig, payload: Option, @@ -261,7 +261,7 @@ struct SignBlobResponse { } #[derive(Debug)] -pub struct GoogleCloudStorageClient { +pub(crate) struct GoogleCloudStorageClient { config: GoogleCloudStorageConfig, client: Client, @@ -273,7 +273,7 @@ pub struct GoogleCloudStorageClient { } impl GoogleCloudStorageClient { - pub fn new(config: GoogleCloudStorageConfig) -> Result { + pub(crate) fn new(config: GoogleCloudStorageConfig) -> Result { let client = config.client_options.client()?; let bucket_name_encoded = percent_encode(config.bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); @@ -286,7 +286,7 @@ impl GoogleCloudStorageClient { }) } - pub fn config(&self) -> &GoogleCloudStorageConfig { + pub(crate) fn config(&self) -> &GoogleCloudStorageConfig { &self.config } @@ -309,7 +309,11 @@ impl GoogleCloudStorageClient { /// "payload": "REQUEST_INFORMATION" /// } /// ``` - pub async fn sign_blob(&self, string_to_sign: &str, client_email: &str) -> Result { + pub(crate) async fn sign_blob( + &self, + string_to_sign: &str, + client_email: &str, + ) -> Result { let credential = self.get_credential().await?; let body = SignBlobBody { payload: BASE64_STANDARD.encode(string_to_sign), @@ -344,7 +348,7 @@ impl GoogleCloudStorageClient { Ok(hex_encode(&signed_blob)) } - pub fn object_url(&self, path: &Path) -> String { + pub(crate) fn object_url(&self, path: &Path) -> String { let encoded = utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); format!( "{}/{}/{}", @@ -355,7 +359,7 @@ impl GoogleCloudStorageClient { /// Perform a put request /// /// Returns the new ETag - pub fn request<'a>(&'a self, method: Method, path: &'a Path) -> Request<'a> { + pub(crate) fn request<'a>(&'a self, method: Method, path: &'a Path) -> Request<'a> { let builder = 
self.client.request(method, self.object_url(path)); Request { @@ -367,7 +371,7 @@ impl GoogleCloudStorageClient { } } - pub async fn put( + pub(crate) async fn put( &self, path: &Path, payload: PutPayload, @@ -398,7 +402,7 @@ impl GoogleCloudStorageClient { /// Perform a put part request /// /// Returns the new [`PartId`] - pub async fn put_part( + pub(crate) async fn put_part( &self, path: &Path, upload_id: &MultipartId, @@ -423,7 +427,7 @@ impl GoogleCloudStorageClient { } /// Initiate a multipart upload - pub async fn multipart_initiate( + pub(crate) async fn multipart_initiate( &self, path: &Path, opts: PutMultipartOpts, @@ -444,7 +448,11 @@ impl GoogleCloudStorageClient { } /// Cleanup unused parts - pub async fn multipart_cleanup(&self, path: &Path, multipart_id: &MultipartId) -> Result<()> { + pub(crate) async fn multipart_cleanup( + &self, + path: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { let credential = self.get_credential().await?; let url = self.object_url(path); @@ -463,7 +471,7 @@ impl GoogleCloudStorageClient { Ok(()) } - pub async fn multipart_complete( + pub(crate) async fn multipart_complete( &self, path: &Path, multipart_id: &MultipartId, @@ -522,13 +530,18 @@ impl GoogleCloudStorageClient { } /// Perform a delete request - pub async fn delete_request(&self, path: &Path) -> Result<()> { + pub(crate) async fn delete_request(&self, path: &Path) -> Result<()> { self.request(Method::DELETE, path).send().await?; Ok(()) } /// Perform a copy request - pub async fn copy_request(&self, from: &Path, to: &Path, if_not_exists: bool) -> Result<()> { + pub(crate) async fn copy_request( + &self, + from: &Path, + to: &Path, + if_not_exists: bool, + ) -> Result<()> { let credential = self.get_credential().await?; let url = self.object_url(to); diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 0e80e62..155a80b 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -44,9 +44,9 @@ use std::time::{Duration, Instant}; use tracing::info; use url::Url; -pub const DEFAULT_SCOPE: &str = "https://www.googleapis.com/auth/cloud-platform"; +pub(crate) const DEFAULT_SCOPE: &str = "https://www.googleapis.com/auth/cloud-platform"; -pub const DEFAULT_GCS_BASE_URL: &str = "https://storage.googleapis.com"; +pub(crate) const DEFAULT_GCS_BASE_URL: &str = "https://storage.googleapis.com"; const DEFAULT_GCS_PLAYLOAD_STRING: &str = "UNSIGNED-PAYLOAD"; const DEFAULT_GCS_SIGN_BLOB_HOST: &str = "storage.googleapis.com"; @@ -166,10 +166,10 @@ pub struct GcpCredential { pub bearer: String, } -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; #[derive(Debug, Default, serde::Serialize)] -pub struct JwtHeader<'a> { +pub(crate) struct JwtHeader<'a> { /// The type of JWS: it can only be "JWT" here /// /// Defined in [RFC7515#4.1.9](https://tools.ietf.org/html/rfc7515#section-4.1.9). @@ -226,7 +226,7 @@ struct TokenResponse { /// # References /// - #[derive(Debug)] -pub struct SelfSignedJwt { +pub(crate) struct SelfSignedJwt { issuer: String, scope: String, private_key: ServiceAccountKey, @@ -235,7 +235,7 @@ pub struct SelfSignedJwt { impl SelfSignedJwt { /// Create a new [`SelfSignedJwt`] - pub fn new( + pub(crate) fn new( key_id: String, issuer: String, private_key: ServiceAccountKey, @@ -314,7 +314,7 @@ where /// A deserialized `service-account-********.json`-file. #[derive(serde::Deserialize, Debug, Clone)] -pub struct ServiceAccountCredentials { +pub(crate) struct ServiceAccountCredentials { /// The private key in RSA format. 
pub private_key: String, @@ -335,12 +335,12 @@ pub struct ServiceAccountCredentials { impl ServiceAccountCredentials { /// Create a new [`ServiceAccountCredentials`] from a file. - pub fn from_file>(path: P) -> Result { + pub(crate) fn from_file>(path: P) -> Result { read_credentials_file(path) } /// Create a new [`ServiceAccountCredentials`] from a string. - pub fn from_key(key: &str) -> Result { + pub(crate) fn from_key(key: &str) -> Result { serde_json::from_str(key).context(DecodeCredentialsSnafu) } @@ -352,7 +352,7 @@ impl ServiceAccountCredentials { /// # References /// - /// - - pub fn token_provider(self) -> crate::Result { + pub(crate) fn token_provider(self) -> crate::Result { Ok(SelfSignedJwt::new( self.private_key_id, self.client_email, @@ -361,7 +361,7 @@ impl ServiceAccountCredentials { )?) } - pub fn signing_credentials(self) -> crate::Result { + pub(crate) fn signing_credentials(self) -> crate::Result { Ok(Arc::new(StaticCredentialProvider::new( GcpSigningCredential { email: self.client_email, @@ -388,7 +388,7 @@ fn b64_encode_obj(obj: &T) -> Result { /// /// #[derive(Debug, Default)] -pub struct InstanceCredentialProvider {} +pub(crate) struct InstanceCredentialProvider {} /// Make a request to the metadata server to fetch a token, using a a given hostname. async fn make_metadata_request( @@ -478,7 +478,7 @@ async fn make_metadata_request_for_email( /// /// #[derive(Debug, Default)] -pub struct InstanceSigningCredentialProvider {} +pub(crate) struct InstanceSigningCredentialProvider {} #[async_trait] impl TokenProvider for InstanceSigningCredentialProvider { @@ -533,7 +533,7 @@ impl TokenProvider for InstanceSigningCredentialProvider { /// - #[derive(serde::Deserialize, Clone)] #[serde(tag = "type")] -pub enum ApplicationDefaultCredentials { +pub(crate) enum ApplicationDefaultCredentials { /// Service Account. /// /// # References @@ -558,7 +558,7 @@ impl ApplicationDefaultCredentials { // Create a new application default credential in the following situations: // 1. a file is passed in and the type matches. // 2. without argument if the well-known configuration file is present. 
- pub fn read(path: Option<&str>) -> Result, Error> { + pub(crate) fn read(path: Option<&str>) -> Result, Error> { if let Some(path) = path { return read_credentials_file::(path).map(Some); } @@ -580,14 +580,14 @@ const DEFAULT_TOKEN_GCP_URI: &str = "https://accounts.google.com/o/oauth2/token" /// #[derive(Debug, Deserialize, Clone)] -pub struct AuthorizedUserCredentials { +pub(crate) struct AuthorizedUserCredentials { client_id: String, client_secret: String, refresh_token: String, } #[derive(Debug, Deserialize)] -pub struct AuthorizedUserSigningCredentials { +pub(crate) struct AuthorizedUserSigningCredentials { credential: AuthorizedUserCredentials, } @@ -598,7 +598,7 @@ struct EmailResponse { } impl AuthorizedUserSigningCredentials { - pub fn from(credential: AuthorizedUserCredentials) -> crate::Result { + pub(crate) fn from(credential: AuthorizedUserCredentials) -> crate::Result { Ok(Self { credential }) } @@ -684,14 +684,14 @@ fn trim_header_value(value: &str) -> String { /// /// [Google SigV4]: https://cloud.google.com/storage/docs/access-control/signed-urls #[derive(Debug)] -pub struct GCSAuthorizer { +pub(crate) struct GCSAuthorizer { date: Option>, credential: Arc, } impl GCSAuthorizer { /// Create a new [`GCSAuthorizer`] - pub fn new(credential: Arc) -> Self { + pub(crate) fn new(credential: Arc) -> Self { Self { date: None, credential, @@ -821,7 +821,7 @@ impl GCSAuthorizer { ///``` ///`ACTIVE_DATETIME` format:`YYYYMMDD'T'HHMMSS'Z'` /// - pub fn string_to_sign( + pub(crate) fn string_to_sign( &self, date: DateTime, request_method: &Method, diff --git a/src/http/client.rs b/src/http/client.rs index 5def931..eeb7e56 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -85,7 +85,7 @@ impl From for crate::Error { /// Internal client for HttpStore #[derive(Debug)] -pub struct Client { +pub(crate) struct Client { url: Url, client: reqwest::Client, retry_config: RetryConfig, @@ -93,7 +93,11 @@ pub struct Client { } impl Client { - pub fn new(url: Url, client_options: ClientOptions, retry_config: RetryConfig) -> Result { + pub(crate) fn new( + url: Url, + client_options: ClientOptions, + retry_config: RetryConfig, + ) -> Result { let client = client_options.client()?; Ok(Self { url, @@ -103,7 +107,7 @@ impl Client { }) } - pub fn base_url(&self) -> &Url { + pub(crate) fn base_url(&self) -> &Url { &self.url } @@ -159,7 +163,7 @@ impl Client { Ok(()) } - pub async fn put( + pub(crate) async fn put( &self, location: &Path, payload: PutPayload, @@ -216,7 +220,7 @@ impl Client { } } - pub async fn list(&self, location: Option<&Path>, depth: &str) -> Result { + pub(crate) async fn list(&self, location: Option<&Path>, depth: &str) -> Result { let url = location .map(|path| self.path_url(path)) .unwrap_or_else(|| self.url.clone()); @@ -255,7 +259,7 @@ impl Client { Ok(status) } - pub async fn delete(&self, path: &Path) -> Result<()> { + pub(crate) async fn delete(&self, path: &Path) -> Result<()> { let url = self.path_url(path); self.client .delete(url) @@ -271,7 +275,7 @@ impl Client { Ok(()) } - pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { + pub(crate) async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let mut retry = false; loop { let method = Method::from_bytes(b"COPY").unwrap(); @@ -364,12 +368,12 @@ impl GetClient for Client { /// The response returned by a PROPFIND request, i.e. 
list #[derive(Deserialize, Default)] -pub struct MultiStatus { +pub(crate) struct MultiStatus { pub response: Vec, } #[derive(Deserialize)] -pub struct MultiStatusResponse { +pub(crate) struct MultiStatusResponse { href: String, #[serde(rename = "propstat")] prop_stat: PropStat, @@ -377,7 +381,7 @@ pub struct MultiStatusResponse { impl MultiStatusResponse { /// Returns an error if this response is not OK - pub fn check_ok(&self) -> Result<()> { + pub(crate) fn check_ok(&self) -> Result<()> { match self.prop_stat.status.contains("200 OK") { true => Ok(()), false => Err(Error::PropStatus { @@ -389,7 +393,7 @@ impl MultiStatusResponse { } /// Returns the resolved path of this element relative to `base_url` - pub fn path(&self, base_url: &Url) -> Result { + pub(crate) fn path(&self, base_url: &Url) -> Result { let url = Url::options() .base_url(Some(base_url)) .parse(&self.href) @@ -413,7 +417,7 @@ impl MultiStatusResponse { } /// Returns this objects metadata as [`ObjectMeta`] - pub fn object_meta(&self, base_url: &Url) -> Result { + pub(crate) fn object_meta(&self, base_url: &Url) -> Result { let last_modified = self.prop_stat.prop.last_modified; Ok(ObjectMeta { location: self.path(base_url)?, @@ -425,19 +429,19 @@ impl MultiStatusResponse { } /// Returns true if this is a directory / collection - pub fn is_dir(&self) -> bool { + pub(crate) fn is_dir(&self) -> bool { self.prop_stat.prop.resource_type.collection.is_some() } } #[derive(Deserialize)] -pub struct PropStat { +pub(crate) struct PropStat { prop: Prop, status: String, } #[derive(Deserialize)] -pub struct Prop { +pub(crate) struct Prop { #[serde(deserialize_with = "deserialize_rfc1123", rename = "getlastmodified")] last_modified: DateTime, @@ -452,6 +456,6 @@ pub struct Prop { } #[derive(Deserialize)] -pub struct ResourceType { +pub(crate) struct ResourceType { collection: Option<()>, } diff --git a/src/lib.rs b/src/lib.rs index a0d83eb..7b1af26 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,7 +23,8 @@ clippy::explicit_iter_loop, clippy::future_not_send, clippy::use_self, - clippy::clone_on_ref_ptr + clippy::clone_on_ref_ptr, + unreachable_pub )] //! 
# object_store @@ -1373,7 +1374,7 @@ mod tests { } #[cfg(any(feature = "azure", feature = "aws"))] - pub async fn signing(integration: &T) + pub(crate) async fn signing(integration: &T) where T: ObjectStore + signer::Signer, { @@ -1396,7 +1397,7 @@ mod tests { } #[cfg(any(feature = "aws", feature = "azure"))] - pub async fn tagging(storage: Arc, validate: bool, get_tags: F) + pub(crate) async fn tagging(storage: Arc, validate: bool, get_tags: F) where F: Fn(Path) -> Fut + Send + Sync, Fut: std::future::Future> + Send, diff --git a/src/local.rs b/src/local.rs index ac10f33..11324b1 100644 --- a/src/local.rs +++ b/src/local.rs @@ -775,7 +775,7 @@ struct UploadState { } impl LocalUpload { - pub fn new(src: PathBuf, dest: PathBuf, file: File) -> Self { + pub(crate) fn new(src: PathBuf, dest: PathBuf, file: File) -> Self { Self { state: Arc::new(UploadState { dest, diff --git a/src/util.rs b/src/util.rs index 161d2d1..ecf90f9 100644 --- a/src/util.rs +++ b/src/util.rs @@ -27,11 +27,11 @@ use futures::{stream::StreamExt, Stream, TryStreamExt}; use snafu::Snafu; #[cfg(any(feature = "azure", feature = "http"))] -pub static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; +pub(crate) static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; // deserialize dates according to rfc1123 #[cfg(any(feature = "azure", feature = "http"))] -pub fn deserialize_rfc1123<'de, D>( +pub(crate) fn deserialize_rfc1123<'de, D>( deserializer: D, ) -> Result, D::Error> where @@ -77,7 +77,7 @@ where #[cfg(not(target_arch = "wasm32"))] /// Takes a function and spawns it to a tokio blocking pool if available -pub async fn maybe_spawn_blocking(f: F) -> Result +pub(crate) async fn maybe_spawn_blocking(f: F) -> Result where F: FnOnce() -> Result + Send + 'static, T: Send + 'static, @@ -93,7 +93,7 @@ where pub const OBJECT_STORE_COALESCE_DEFAULT: usize = 1024 * 1024; /// Up to this number of range requests will be performed in parallel by [`coalesce_ranges`] -pub const OBJECT_STORE_COALESCE_PARALLEL: usize = 10; +pub(crate) const OBJECT_STORE_COALESCE_PARALLEL: usize = 10; /// Takes a function `fetch` that can fetch a range of bytes and uses this to /// fetch the provided byte `ranges` From b95a4f027f5794815a35f30a29805399cde8e445 Mon Sep 17 00:00:00 2001 From: Peter Ke Date: Mon, 7 Oct 2024 01:30:27 -0700 Subject: [PATCH 347/397] [object_store] Retry S3 requests with 200 response with "Error" in body (#6508) * rebase * generalize * add test * fix lint * remove dep --------- Co-authored-by: Peter Ke --- src/aws/client.rs | 12 +++++ src/client/retry.rs | 124 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 134 insertions(+), 2 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index cc74f2d..4b4d0b6 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -266,6 +266,7 @@ pub(crate) struct Request<'a> { payload: Option, use_session_creds: bool, idempotent: bool, + retry_error_body: bool, } impl<'a> Request<'a> { @@ -292,6 +293,13 @@ impl<'a> Request<'a> { Self { idempotent, ..self } } + pub(crate) fn retry_error_body(self, retry_error_body: bool) -> Self { + Self { + retry_error_body, + ..self + } + } + pub(crate) fn with_encryption_headers(self) -> Self { let headers = self.config.encryption_headers.clone().into(); let builder = self.builder.headers(headers); @@ -379,6 +387,7 @@ impl<'a> Request<'a> { .with_aws_sigv4(credential.authorizer(), sha) .retryable(&self.config.retry_config) .idempotent(self.idempotent) + .retry_error_body(self.retry_error_body) .payload(self.payload) .send() .await @@ -413,6 +422,7 @@ 
impl S3Client { config: &self.config, use_session_creds: true, idempotent: false, + retry_error_body: false, } } @@ -559,6 +569,7 @@ impl S3Client { self.request(Method::PUT, to) .idempotent(true) + .retry_error_body(true) .header(©_SOURCE_HEADER, &source) .headers(self.config.encryption_headers.clone().into()) .headers(copy_source_encryption_headers) @@ -648,6 +659,7 @@ impl S3Client { .with_aws_sigv4(credential.authorizer(), None) .retryable(&self.config.retry_config) .idempotent(true) + .retry_error_body(true) .send() .await .context(CompleteMultipartRequestSnafu)?; diff --git a/src/client/retry.rs b/src/client/retry.rs index 2f2ba0a..601bffd 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -33,6 +33,12 @@ pub enum Error { #[snafu(display("Received redirect without LOCATION, this normally indicates an incorrectly configured region"))] BareRedirect, + #[snafu(display("Server error, body contains Error, with status {status}: {}", body.as_deref().unwrap_or("No Body")))] + Server { + status: StatusCode, + body: Option, + }, + #[snafu(display("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body")))] Client { status: StatusCode, @@ -54,6 +60,7 @@ impl Error { pub fn status(&self) -> Option { match self { Self::BareRedirect => None, + Self::Server { status, .. } => Some(*status), Self::Client { status, .. } => Some(*status), Self::Reqwest { source, .. } => source.status(), } @@ -63,6 +70,7 @@ impl Error { pub fn body(&self) -> Option<&str> { match self { Self::Client { body, .. } => body.as_deref(), + Self::Server { body, .. } => body.as_deref(), Self::BareRedirect => None, Self::Reqwest { .. } => None, } @@ -178,6 +186,10 @@ impl Default for RetryConfig { } } +fn body_contains_error(response_body: &str) -> bool { + response_body.contains("InternalError") || response_body.contains("SlowDown") +} + pub(crate) struct RetryableRequest { client: Client, request: Request, @@ -189,6 +201,8 @@ pub(crate) struct RetryableRequest { sensitive: bool, idempotent: Option, payload: Option, + + retry_error_body: bool, } impl RetryableRequest { @@ -216,6 +230,14 @@ impl RetryableRequest { Self { payload, ..self } } + #[allow(unused)] + pub(crate) fn retry_error_body(self, retry_error_body: bool) -> Self { + Self { + retry_error_body, + ..self + } + } + pub(crate) async fn send(self) -> Result { let max_retries = self.max_retries; let retry_timeout = self.retry_timeout; @@ -244,7 +266,57 @@ impl RetryableRequest { match self.client.execute(request).await { Ok(r) => match r.error_for_status_ref() { - Ok(_) if r.status().is_success() => return Ok(r), + Ok(_) if r.status().is_success() => { + // For certain S3 requests, 200 response may contain `InternalError` or + // `SlowDown` in the message. These responses should be handled similarly + // to r5xx errors. 
+ // More info here: https://repost.aws/knowledge-center/s3-resolve-200-internalerror + if !self.retry_error_body { + return Ok(r); + } + + let status = r.status(); + let headers = r.headers().clone(); + + let bytes = r.bytes().await.map_err(|e| Error::Reqwest { + retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, + source: e, + })?; + + let response_body = String::from_utf8_lossy(&bytes); + info!("Checking for error in response_body: {}", response_body); + + if !body_contains_error(&response_body) { + // Success response and no error, clone and return response + let mut success_response = hyper::Response::new(bytes); + *success_response.status_mut() = status; + *success_response.headers_mut() = headers; + + return Ok(reqwest::Response::from(success_response)); + } else { + // Retry as if this was a 5xx response + if retries == max_retries || now.elapsed() > retry_timeout { + return Err(Error::Server { + body: Some(response_body.into_owned()), + status, + }); + } + + let sleep = backoff.next(); + retries += 1; + info!( + "Encountered a response status of {} but body contains Error, backing off for {} seconds, retry {} of {}", + status, + sleep.as_secs_f32(), + retries, + max_retries, + ); + tokio::time::sleep(sleep).await; + } + } Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { return Err(Error::Client { body: None, @@ -395,6 +467,7 @@ impl RetryExt for reqwest::RequestBuilder { idempotent: None, payload: None, sensitive: false, + retry_error_body: false, } } @@ -407,13 +480,27 @@ impl RetryExt for reqwest::RequestBuilder { #[cfg(test)] mod tests { use crate::client::mock_server::MockServer; - use crate::client::retry::{Error, RetryExt}; + use crate::client::retry::{body_contains_error, Error, RetryExt}; use crate::RetryConfig; use hyper::header::LOCATION; use hyper::Response; use reqwest::{Client, Method, StatusCode}; use std::time::Duration; + #[test] + fn test_body_contains_error() { + // Example error message provided by https://repost.aws/knowledge-center/s3-resolve-200-internalerror + let error_response = "AmazonS3Exception: We encountered an internal error. Please try again. 
(Service: Amazon S3; Status Code: 200; Error Code: InternalError; Request ID: 0EXAMPLE9AAEB265)"; + assert!(body_contains_error(error_response)); + + let error_response_2 = "SlowDownPlease reduce your request rate.123456"; + assert!(body_contains_error(error_response_2)); + + // Example success response from https://docs.aws.amazon.com/AmazonS3/latest/API/API_CopyObject.html + let success_response = "2009-10-12T17:50:30.000Z\"9b2cf535f27731c974343645a3985328\""; + assert!(!body_contains_error(success_response)); + } + #[tokio::test] async fn test_retry() { let mock = MockServer::new().await; @@ -637,6 +724,39 @@ mod tests { let err = req.send().await.unwrap_err().to_string(); assert!(!err.contains("SENSITIVE"), "{err}"); + // Success response with error in body is retried + mock.push( + Response::builder() + .status(StatusCode::OK) + .body("InternalError".to_string()) + .unwrap(), + ); + let req = client + .request(Method::PUT, &url) + .retryable(&retry) + .idempotent(true) + .retry_error_body(true); + let r = req.send().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + // Response with InternalError should have been retried + assert!(!r.text().await.unwrap().contains("InternalError")); + + // Should not retry success response with no error in body + mock.push( + Response::builder() + .status(StatusCode::OK) + .body("success".to_string()) + .unwrap(), + ); + let req = client + .request(Method::PUT, &url) + .retryable(&retry) + .idempotent(true) + .retry_error_body(true); + let r = req.send().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + assert!(r.text().await.unwrap().contains("success")); + // Shutdown mock.shutdown().await } From 7c920ae6f0f8c190f3e66f37c84b369a6960388d Mon Sep 17 00:00:00 2001 From: Val Lorentz Date: Mon, 7 Oct 2024 15:38:34 +0200 Subject: [PATCH 348/397] object_store: Clarify what is a prefix in list() documentation (#6520) It should be obvious to reader that `foo/bar/` is not a prefix of `foo/bar_baz`, because it is true whether they think in term of strings, S3-like API paths, or POSIX-like paths. What is less clear is that `foo/bar` is also not a prefix of `foo/bar_baz`, because this is not true with strings or S3-like API paths, only of POSIX-like paths. Additionally, the definition of paths (https://docs.rs/object_store/0.11.0/object_store/path/struct.Path.html) says that paths cannot have a trailing `/`. --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 7b1af26..4d8d8f0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -716,7 +716,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// List all the objects with the given prefix. /// - /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar` is a prefix of `foo/bar/x` but not of /// `foo/bar_baz/x`. List is recursive, i.e. `foo/bar/more/x` will be included. /// /// Note: the order of returned [`ObjectMeta`] is not guaranteed @@ -743,7 +743,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// delimiter. Returns common prefixes (directories) in addition to object /// metadata. /// - /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar` is a prefix of `foo/bar/x` but not of /// `foo/bar_baz/x`. List is not recursive, i.e. 
`foo/bar/more/x` will not be included. async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; From 0864d3412e46d8ef43aac3e7ea742c349a65cc9d Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:08:57 -0500 Subject: [PATCH 349/397] object_store: fix typo in with_connect_timeout_disabled that actually disabled non-connect timeouts (#6563) --- src/client/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/mod.rs b/src/client/mod.rs index 7b1b469..b65fea7 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -501,7 +501,7 @@ impl ClientOptions { /// /// See [`Self::with_connect_timeout`] pub fn with_connect_timeout_disabled(mut self) -> Self { - self.timeout = None; + self.connect_timeout = None; self } From 12d3dfa5aace7746c9ea4879228fb3881e5033f8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 15 Oct 2024 16:46:45 -0400 Subject: [PATCH 350/397] Prepare for object_store `0.11.1` release (#6566) * Update version to 0.11.1 * Update changelog * add old changelog * Update release * prettier --- CHANGELOG-old.md | 17 ++++++++++++++ CHANGELOG.md | 38 ++++++++++++++++++++++++-------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 ++-- 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 172b0f9..28dbde4 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,23 @@ # Historical Changelog +## [object_store_0.11.0](https://github.com/apache/arrow-rs/tree/object_store_0.11.0) (2024-08-12) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.2...object_store_0.11.0) + +**Breaking changes:** + +- Make object\_store errors non-exhaustive [\#6165](https://github.com/apache/arrow-rs/pull/6165) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update snafu to `0.8.0` in object\_store \(\#5930\) [\#6070](https://github.com/apache/arrow-rs/pull/6070) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + + +**Merged pull requests:** + +- Add LICENSE and NOTICE files to object_store [\#6234](https://github.com/apache/arrow-rs/pull/6234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- feat\(object\_store\): add `PermissionDenied` variant to top-level error [\#6194](https://github.com/apache/arrow-rs/pull/6194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kyle-mccarthy](https://github.com/kyle-mccarthy)) +- Update object store MSRV to `1.64` [\#6123](https://github.com/apache/arrow-rs/pull/6123) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Fix clippy in object\_store crate [\#6120](https://github.com/apache/arrow-rs/pull/6120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + ## [object_store_0.10.2](https://github.com/apache/arrow-rs/tree/object_store_0.10.2) (2024-07-17) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.1...object_store_0.10.2) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18dde11..9558598 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,22 +19,42 @@ # Changelog -## [object_store_0.11.0](https://github.com/apache/arrow-rs/tree/object_store_0.11.0) (2024-08-12) +## 
[object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.2...object_store_0.11.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) -**Breaking changes:** +**Implemented enhancements:** -- Make object\_store errors non-exhaustive [\#6165](https://github.com/apache/arrow-rs/pull/6165) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update snafu to `0.8.0` in object\_store \(\#5930\) [\#6070](https://github.com/apache/arrow-rs/pull/6070) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +**Fixed bugs:** + +- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Add LICENSE and NOTICE files to object_store [\#6234](https://github.com/apache/arrow-rs/pull/6234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- feat\(object\_store\): add `PermissionDenied` variant to top-level error [\#6194](https://github.com/apache/arrow-rs/pull/6194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kyle-mccarthy](https://github.com/kyle-mccarthy)) -- Update object store MSRV to `1.64` [\#6123](https://github.com/apache/arrow-rs/pull/6123) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Fix clippy in object\_store crate [\#6120](https://github.com/apache/arrow-rs/pull/6120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) +- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) +- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) +- \[object-store\] Require tokio 1.29.0. [\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) +- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) +- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) +- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) +- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) +- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) +- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- docs: Add parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) +- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) + \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/Cargo.toml b/Cargo.toml index 3a90322..cab2ac3 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.11.0" +version = "0.11.1" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 142bbb0..3072447 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.10.2" -FUTURE_RELEASE="object_store_0.11.0" +SINCE_TAG="object_store_0.11.0" +FUTURE_RELEASE="object_store_0.11.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 2eeaa30e52d2b286d02d4df9a952c4a778c77445 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 16 Oct 2024 06:26:14 -0400 Subject: [PATCH 351/397] Update `object_store` release documentation (#6565) * Update object_store release documentation * Update object store * prettier * tweak * Update object_store/dev/release/README.md * update instructions * fix * Update object_store/dev/release/README.md --- dev/release/README.md | 207 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 206 insertions(+), 1 deletion(-) diff --git a/dev/release/README.md b/dev/release/README.md index 89f6e57..4077dca 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -17,4 +17,209 @@ under the License. --> -See instructions in [`/dev/release/README.md`](../../../dev/release/README.md) + +# Release Process + +## Overview + +This file documents the release process for the `object_store` crate. + +At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule. + +As we are still in an early phase, we use the 0.x version scheme. If any code has +been merged to master that has a breaking API change, as defined in [Rust RFC 1105] +the minor version number is incremented changed (e.g. `0.3.0` to `0.4.0`). +Otherwise the patch version is incremented (e.g. `0.3.0` to `0.3.1`). + +[Rust RFC 1105]: https://github.com/rust-lang/rfcs/blob/master/text/1105-api-evolution.md +# Release Mechanics + +## Process Overview + +As part of the Apache governance model, official releases consist of +signed source tarballs approved by the PMC. + +We then use the code in the approved source tarball to release to +crates.io, the Rust ecosystem's package manager. + +We create a `CHANGELOG.md` so our users know what has been changed between releases. + +The CHANGELOG is created automatically using +[update_change_log.sh](https://github.com/apache/arrow-rs/blob/master/object_store/dev/release/update_change_log.sh) + +This script creates a changelog using github issues and the +labels associated with them. + +## Prepare CHANGELOG and version: + +Now prepare a PR to update `CHANGELOG.md` and versions on `master` to reflect the planned release. + +Note this process is done in the `object_store` directory. See [#6227] for an example + +[#6227]: https://github.com/apache/arrow-rs/pull/6227 + +```bash +# NOTE: Run commands in object_store sub directory (not main repo checkout) +# cd object_store + +git checkout master +git pull +git checkout -b + +# Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. +sed -i '' -e 's/0.11.0/0.11.1/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG` +git commit -a -m 'Update version' + +# ensure your github token is available +export CHANGELOG_GITHUB_TOKEN= + +# manually edit ./dev/release/update_change_log.sh to reflect the release version +# create the changelog +./dev/release/update_change_log.sh + +# review change log / and edit associated issues and labels if needed, rerun update_change_log.sh + +# Commit changes +git commit -a -m 'Create changelog' + +# push changes to fork and create a PR to master +git push +``` + +Note that when reviewing the change log, rather than editing the +`CHANGELOG.md`, it is preferred to update the issues and their labels +(e.g. add `invalid` label to exclude them from release notes) + +Merge this PR to `master` prior to the next step. + +## Prepare release candidate tarball + +After you have merged the updates to the `CHANGELOG` and version, +create a release candidate using the following steps. Note you need to +be a committer to run these scripts as they upload to the apache `svn` +distribution servers. + +### Create git tag for the release: + +While the official release artifact is a signed tarball, we also tag the commit it was created for convenience and code archaeology. + +For `object_store` releases, use a string such as `object_store_0.4.0` as the ``. + +Create and push the tag thusly: + +```shell +git fetch apache +git tag apache/master +# push tag to apache +git push apache +``` + +### Pick an Release Candidate (RC) number + +Pick numbers in sequential order, with `1` for `rc1`, `2` for `rc2`, etc. + +### Create, sign, and upload tarball + +Run `create-tarball.sh` with the `` tag and `` and you found in previous steps. + +```shell +./object_store/dev/release/create-tarball.sh 0.11.1 1 +``` + +The `create-tarball.sh` script + +1. creates and uploads a release candidate tarball to the [arrow + dev](https://dist.apache.org/repos/dist/dev/arrow) location on the + apache distribution svn server + +2. provide you an email template to + send to dev@arrow.apache.org for release voting. + +### Vote on Release Candidate tarball + +Send an email, based on the output from the script to dev@arrow.apache.org. The email should look like + +``` +Draft email for dev@arrow.apache.org mailing list + +--------------------------------------------------------- +To: dev@arrow.apache.org +Subject: [VOTE][RUST] Release Apache Arrow Rust Object Store 0.11.1 RC1 + +Hi, + +I would like to propose a release of Apache Arrow Rust Object +Store Implementation, version 0.11.1. + +This release candidate is based on commit: b945b15de9085f5961a478d4f35b0c5c3427e248 [1] + +The proposed release tarball and signatures are hosted at [2]. + +The changelog is located at [3]. + +Please download, verify checksums and signatures, run the unit tests, +and vote on the release. There is a script [4] that automates some of +the verification. + +The vote will be open for at least 72 hours. + +[ ] +1 Release this as Apache Arrow Rust Object Store +[ ] +0 +[ ] -1 Do not release this as Apache Arrow Rust Object Store because... 
+ +[1]: https://github.com/apache/arrow-rs/tree/b945b15de9085f5961a478d4f35b0c5c3427e248 +[2]: https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-object-store-rs-0.11.1-rc1 +[3]: https://github.com/apache/arrow-rs/blob/b945b15de9085f5961a478d4f35b0c5c3427e248/object_store/CHANGELOG.md +[4]: https://github.com/apache/arrow-rs/blob/master/object_store/dev/release/verify-release-candidate.sh +``` + +For the release to become "official", it needs at least three Apache Arrow PMC members to vote +1 on it. + +## Verifying release candidates + +The `object_store/dev/release/verify-release-candidate.sh` script can assist in the verification process. Run it like: + +``` +./object_store/dev/release/verify-release-candidate.sh 0.11.0 1 +``` + +### If the release is not approved + +If the release is not approved, fix whatever the problem is and try again with the next RC number. + +### If the release is approved + +Move the tarball to the release location in SVN, e.g. https://dist.apache.org/repos/dist/release/arrow/arrow-4.1.0/, using the `release-tarball.sh` script: + + +```shell +./object_store/dev/release/release-tarball.sh 4.1.0 2 +``` + +Congratulations! The release is now official! + +### Publish on Crates.io + +Only approved releases of the tarball should be published to +crates.io, in order to conform to Apache Software Foundation +governance standards. + +An Arrow committer can publish this crate to crates.io after an official project release has +been made, using the following instructions. + +Follow [these +instructions](https://doc.rust-lang.org/cargo/reference/publishing.html) to +create an account and log in to crates.io before asking to be added as an owner +of the [arrow crate](https://crates.io/crates/arrow). + +Download and unpack the official release tarball. + +Verify that the Cargo.toml in the tarball contains the correct version +(e.g. 
`version = "0.11.0"`) and then publish the crate with the +following commands + + +```shell +cargo publish +``` + From 688126dfadf61429c67356199965ee07fc01b0c8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 20 Oct 2024 16:21:37 -0400 Subject: [PATCH 352/397] Remove `test_private_bucket` object_store test (#6601) * Fix `test_private_bucket` test * remove test --- src/aws/resolve.rs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/aws/resolve.rs b/src/aws/resolve.rs index 4c74893..25bc74f 100644 --- a/src/aws/resolve.rs +++ b/src/aws/resolve.rs @@ -78,19 +78,6 @@ pub async fn resolve_bucket_region(bucket: &str, client_options: &ClientOptions) mod tests { use super::*; - #[tokio::test] - async fn test_private_bucket() { - let bucket = "bloxbender"; - - let region = resolve_bucket_region(bucket, &ClientOptions::new()) - .await - .unwrap(); - - let expected = "us-west-2".to_string(); - - assert_eq!(region, expected); - } - #[tokio::test] async fn test_bucket_does_not_exist() { let bucket = "please-dont-exist"; From 6a97a5f3e7074ee3887dbd2f4b1a98578d3c09db Mon Sep 17 00:00:00 2001 From: lambda <1wei@live.com> Date: Tue, 29 Oct 2024 23:06:08 +0800 Subject: [PATCH 353/397] [object_store] fix S3 endpoint and trailing slash result in invalid requests (#6641) Co-authored-by: Yiwei Wang --- src/aws/builder.rs | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index c52c3f8..eb79f5e 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -961,7 +961,7 @@ impl AmazonS3Builder { let virtual_hosted = self.virtual_hosted_style_request.get()?; let bucket_endpoint = match (&self.endpoint, zonal_endpoint, virtual_hosted) { (Some(endpoint), _, true) => endpoint.clone(), - (Some(endpoint), _, false) => format!("{endpoint}/{bucket}"), + (Some(endpoint), _, false) => format!("{}/{}", endpoint.trim_end_matches("/"), bucket), (None, Some(endpoint), _) => endpoint, (None, None, true) => format!("https://{bucket}.s3.{region}.amazonaws.com"), (None, None, false) => format!("https://s3.{region}.amazonaws.com/{bucket}"), @@ -1316,6 +1316,29 @@ mod tests { assert_eq!(builder.client.config.region, "us-east-1"); } + #[test] + fn s3_test_bucket_endpoint() { + let builder = AmazonS3Builder::new() + .with_endpoint("http://some.host:1234") + .with_bucket_name("foo") + .build() + .unwrap(); + assert_eq!( + builder.client.config.bucket_endpoint, + "http://some.host:1234/foo" + ); + + let builder = AmazonS3Builder::new() + .with_endpoint("http://some.host:1234/") + .with_bucket_name("foo") + .build() + .unwrap(); + assert_eq!( + builder.client.config.bucket_endpoint, + "http://some.host:1234/foo" + ); + } + #[test] fn s3_test_urls() { let mut builder = AmazonS3Builder::new(); From ad0d4ff074f86c2aa4f4d1b6b10de3b634208a87 Mon Sep 17 00:00:00 2001 From: Micah Wylde Date: Wed, 30 Oct 2024 00:22:22 -0700 Subject: [PATCH 354/397] Lower GCP token min_ttl to 4 minutes and add backoff to token refresh logic (#6638) --- src/client/mod.rs | 2 +- src/client/token.rs | 89 +++++++++++++++++++++++++++++++++++++++++---- src/gcp/builder.rs | 25 ++++++++----- 3 files changed, 99 insertions(+), 17 deletions(-) diff --git a/src/client/mod.rs b/src/client/mod.rs index b65fea7..76d1c1f 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -774,7 +774,7 @@ mod cloud { } /// Override the minimum remaining TTL for a cached token to be used - #[cfg(feature = "aws")] + #[cfg(any(feature = "aws", feature = "gcp"))] pub(crate) fn 
with_min_ttl(mut self, min_ttl: Duration) -> Self { self.cache = self.cache.with_min_ttl(min_ttl); self diff --git a/src/client/token.rs b/src/client/token.rs index f729419..81ffc11 100644 --- a/src/client/token.rs +++ b/src/client/token.rs @@ -33,8 +33,9 @@ pub(crate) struct TemporaryToken { /// [`TemporaryToken`] based on its expiry #[derive(Debug)] pub(crate) struct TokenCache { - cache: Mutex>>, + cache: Mutex, Instant)>>, min_ttl: Duration, + fetch_backoff: Duration, } impl Default for TokenCache { @@ -42,13 +43,16 @@ impl Default for TokenCache { Self { cache: Default::default(), min_ttl: Duration::from_secs(300), + // How long to wait before re-attempting a token fetch after receiving one that + // is still within the min-ttl + fetch_backoff: Duration::from_millis(100), } } } impl TokenCache { /// Override the minimum remaining TTL for a cached token to be used - #[cfg(feature = "aws")] + #[cfg(any(feature = "aws", feature = "gcp"))] pub(crate) fn with_min_ttl(self, min_ttl: Duration) -> Self { Self { min_ttl, ..self } } @@ -61,20 +65,91 @@ impl TokenCache { let now = Instant::now(); let mut locked = self.cache.lock().await; - if let Some(cached) = locked.as_ref() { + if let Some((cached, fetched_at)) = locked.as_ref() { match cached.expiry { - Some(ttl) if ttl.checked_duration_since(now).unwrap_or_default() > self.min_ttl => { - return Ok(cached.token.clone()); + Some(ttl) => { + if ttl.checked_duration_since(now).unwrap_or_default() > self.min_ttl || + // if we've recently attempted to fetch this token and it's not actually + // expired, we'll wait to re-fetch it and return the cached one + (fetched_at.elapsed() < self.fetch_backoff && ttl.checked_duration_since(now).is_some()) + { + return Ok(cached.token.clone()); + } } None => return Ok(cached.token.clone()), - _ => (), } } let cached = f().await?; let token = cached.token.clone(); - *locked = Some(cached); + *locked = Some((cached, Instant::now())); Ok(token) } } + +#[cfg(test)] +mod test { + use crate::client::token::{TemporaryToken, TokenCache}; + use std::sync::atomic::{AtomicU32, Ordering}; + use std::time::{Duration, Instant}; + + // Helper function to create a token with a specific expiry duration from now + fn create_token(expiry_duration: Option) -> TemporaryToken { + TemporaryToken { + token: "test_token".to_string(), + expiry: expiry_duration.map(|d| Instant::now() + d), + } + } + + #[tokio::test] + async fn test_expired_token_is_refreshed() { + let cache = TokenCache::default(); + static COUNTER: AtomicU32 = AtomicU32::new(0); + + async fn get_token() -> Result, String> { + COUNTER.fetch_add(1, Ordering::SeqCst); + Ok::<_, String>(create_token(Some(Duration::from_secs(0)))) + } + + // Should fetch initial token + let _ = cache.get_or_insert_with(get_token).await.unwrap(); + assert_eq!(COUNTER.load(Ordering::SeqCst), 1); + + tokio::time::sleep(Duration::from_millis(2)).await; + + // Token is expired, so should fetch again + let _ = cache.get_or_insert_with(get_token).await.unwrap(); + assert_eq!(COUNTER.load(Ordering::SeqCst), 2); + } + + #[tokio::test] + async fn test_min_ttl_causes_refresh() { + let cache = TokenCache { + cache: Default::default(), + min_ttl: Duration::from_secs(1), + fetch_backoff: Duration::from_millis(1), + }; + + static COUNTER: AtomicU32 = AtomicU32::new(0); + + async fn get_token() -> Result, String> { + COUNTER.fetch_add(1, Ordering::SeqCst); + Ok::<_, String>(create_token(Some(Duration::from_millis(100)))) + } + + // Initial fetch + let _ = 
cache.get_or_insert_with(get_token).await.unwrap(); + assert_eq!(COUNTER.load(Ordering::SeqCst), 1); + + // Should not fetch again since not expired and within fetch_backoff + let _ = cache.get_or_insert_with(get_token).await.unwrap(); + assert_eq!(COUNTER.load(Ordering::SeqCst), 1); + + tokio::time::sleep(Duration::from_millis(2)).await; + + // Should fetch, since we've passed fetch_backoff + let _ = cache.get_or_insert_with(get_token).await.unwrap(); + assert_eq!(COUNTER.load(Ordering::SeqCst), 2); + } +} diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 26cc821..fac923c 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -30,10 +30,13 @@ use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use url::Url; use super::credential::{AuthorizedUserSigningCredentials, InstanceSigningCredentialProvider}; +const TOKEN_MIN_TTL: Duration = Duration::from_secs(4 * 60); + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Missing bucket name"))] @@ -463,13 +466,14 @@ impl GoogleCloudStorageBuilder { )) as _ } else if let Some(credentials) = application_default_credentials.clone() { match credentials { - ApplicationDefaultCredentials::AuthorizedUser(token) => { - Arc::new(TokenCredentialProvider::new( + ApplicationDefaultCredentials::AuthorizedUser(token) => Arc::new( + TokenCredentialProvider::new( token, self.client_options.client()?, self.retry_config.clone(), - )) as _ - } + ) + .with_min_ttl(TOKEN_MIN_TTL), + ) as _, ApplicationDefaultCredentials::ServiceAccount(token) => { Arc::new(TokenCredentialProvider::new( token.token_provider()?, @@ -479,11 +483,14 @@ impl GoogleCloudStorageBuilder { } } } else { - Arc::new(TokenCredentialProvider::new( - InstanceCredentialProvider::default(), - self.client_options.metadata_client()?, - self.retry_config.clone(), - )) as _ + Arc::new( + TokenCredentialProvider::new( + InstanceCredentialProvider::default(), + self.client_options.metadata_client()?, + self.retry_config.clone(), + ) + .with_min_ttl(TOKEN_MIN_TTL), + ) as _ }; let signing_credentials = if let Some(signing_credentials) = self.signing_credentials { From 919454a65a4abf34804b8769716c9dc36f45140b Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 5 Nov 2024 16:43:35 +0100 Subject: [PATCH 355/397] Update quick-xml requirement from 0.36.0 to 0.37.0 in /object_store (#6687) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.36.0...v0.37.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- src/gcp/client.rs | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cab2ac3..86d1392 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } hyper = { version = "1.2", default-features = false, optional = true } -quick-xml = { version = "0.36.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.37.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index a259f6b..ccc9c34 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -79,6 +79,9 @@ enum Error { #[snafu(display("Error getting put response body: {}", source))] PutResponseBody { source: reqwest::Error }, + #[snafu(display("Got invalid put request: {}", source))] + InvalidPutRequest { source: quick_xml::se::SeError }, + #[snafu(display("Got invalid put response: {}", source))] InvalidPutResponse { source: quick_xml::de::DeError }, @@ -495,7 +498,7 @@ impl GoogleCloudStorageClient { let credential = self.get_credential().await?; let data = quick_xml::se::to_string(&upload_info) - .context(InvalidPutResponseSnafu)? + .context(InvalidPutRequestSnafu)? // We cannot disable the escaping that transforms "/" to ""e;" :( // https://github.com/tafia/quick-xml/issues/362 // https://github.com/tafia/quick-xml/issues/350 From 23162d87f63b41362657ce5dd444e8c6dc0ee1a1 Mon Sep 17 00:00:00 2001 From: Nikhil Benesch Date: Fri, 8 Nov 2024 10:10:44 -0500 Subject: [PATCH 356/397] Support native S3 conditional writes (#6682) * Support native S3 conditional writes Add support for `PutMode::Create` and `copy_if_not_exists` on native AWS S3, which uses the underlying conditional write primitive that Amazon launched earlier this year [0]. The conditional write primitive is simpler than what's available in other S3-like products (e.g., R2), so new modes for `s3_copy_if_not_exists` and `s3_conditional_put` are added to select the native S3-specific behavior. To maintain strict backwards compatibility (e.g. with older versions of LocalStack), the new behavior is not on by default. It must be explicitly requested by the end user. The implementation for `PutMode::Create` is straightforward. The implementation of `copy_if_not_exists` is a bit more involved, as it requires managing a multipart upload that uses the UploadPartCopy operation, which was not previously supported by this crate's S3 client. To ensure test coverage, the object store workflow now runs the AWS integration tests with conditional put both disabled and enabled. Fix #6285. [0]: https://aws.amazon.com/about-aws/whats-new/2024/08/amazon-s3-conditional-writes/ * Address review feedback * Fix clippy failure * Upgrade localstack in GitHub Actions To a version that supports conditional writes. 
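For example (a usage sketch added for illustration, not part of this patch: it assumes the crate's existing `AmazonS3Builder::from_env`/`with_config` helpers, and the bucket name and payload below are placeholders), a client can opt in to the new behavior with the string encodings this change introduces (`etag-put-if-not-exists` for `aws_conditional_put`, `multipart` for `aws_copy_if_not_exists`):

```rust
use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};
use object_store::{path::Path, ObjectStore, PutMode};

async fn create_only_example() -> object_store::Result<()> {
    // Opt in to the native S3 conditional primitives using the string
    // encodings parsed by S3ConditionalPut/S3CopyIfNotExists::from_str below.
    let store = AmazonS3Builder::from_env()
        .with_bucket_name("example-bucket") // placeholder bucket
        .with_config(AmazonS3ConfigKey::ConditionalPut, "etag-put-if-not-exists")
        .with_config(AmazonS3ConfigKey::CopyIfNotExists, "multipart")
        .build()?;

    // With etag-put-if-not-exists, PutMode::Create is sent as an
    // `If-None-Match: *` PUT and fails with AlreadyExists if the key is taken.
    let path = Path::from("locks/leader");
    match store
        .put_opts(&path, "owner-1".into(), PutMode::Create.into())
        .await
    {
        Ok(_) => println!("created {path}"),
        Err(object_store::Error::AlreadyExists { .. }) => println!("already present"),
        Err(e) => return Err(e),
    }
    Ok(())
}
```

Note that `copy_if_not_exists` in `multipart` mode drives an UploadPartCopy-based multipart upload that is completed with `If-None-Match: *`, so pairing it with a bucket lifecycle rule for abandoned multipart uploads is advisable, as described above.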
--- src/aws/client.rs | 95 +++++++++++++++++++++++++++++++++++------ src/aws/mod.rs | 77 +++++++++++++++++++++++++++++---- src/aws/precondition.rs | 33 ++++++++++++++ src/client/s3.rs | 8 ++++ src/integration.rs | 6 +++ 5 files changed, 196 insertions(+), 23 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 4b4d0b6..a610e63 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -28,8 +28,8 @@ use crate::client::header::{get_put_result, get_version}; use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::s3::{ - CompleteMultipartUpload, CompleteMultipartUploadResult, InitiateMultipartUploadResult, - ListResponse, + CompleteMultipartUpload, CompleteMultipartUploadResult, CopyPartResult, + InitiateMultipartUploadResult, ListResponse, }; use crate::client::GetOptionsExt; use crate::multipart::PartId; @@ -98,8 +98,11 @@ pub(crate) enum Error { #[snafu(display("Error getting create multipart response body: {}", source))] CreateMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Error performing complete multipart request: {}", source))] - CompleteMultipartRequest { source: crate::client::retry::Error }, + #[snafu(display("Error performing complete multipart request: {}: {}", path, source))] + CompleteMultipartRequest { + source: crate::client::retry::Error, + path: String, + }, #[snafu(display("Error getting complete multipart response body: {}", source))] CompleteMultipartResponseBody { source: reqwest::Error }, @@ -118,13 +121,32 @@ pub(crate) enum Error { impl From for crate::Error { fn from(err: Error) -> Self { - Self::Generic { - store: STORE, - source: Box::new(err), + match err { + Error::CompleteMultipartRequest { source, path } => source.error(STORE, path), + _ => Self::Generic { + store: STORE, + source: Box::new(err), + }, } } } +pub(crate) enum PutPartPayload<'a> { + Part(PutPayload), + Copy(&'a Path), +} + +impl Default for PutPartPayload<'_> { + fn default() -> Self { + Self::Part(PutPayload::default()) + } +} + +pub(crate) enum CompleteMultipartMode { + Overwrite, + Create, +} + #[derive(Deserialize)] #[serde(rename_all = "PascalCase", rename = "DeleteResult")] struct BatchDeleteResponse { @@ -605,15 +627,24 @@ impl S3Client { path: &Path, upload_id: &MultipartId, part_idx: usize, - data: PutPayload, + data: PutPartPayload<'_>, ) -> Result { + let is_copy = matches!(data, PutPartPayload::Copy(_)); let part = (part_idx + 1).to_string(); let mut request = self .request(Method::PUT, path) - .with_payload(data) .query(&[("partNumber", &part), ("uploadId", upload_id)]) .idempotent(true); + + request = match data { + PutPartPayload::Part(payload) => request.with_payload(payload), + PutPartPayload::Copy(path) => request.header( + "x-amz-copy-source", + &format!("{}/{}", self.config.bucket, encode_path(path)), + ), + }; + if self .config .encryption_headers @@ -625,21 +656,48 @@ impl S3Client { } let response = request.send().await?; - let content_id = get_etag(response.headers()).context(MetadataSnafu)?; + let content_id = match is_copy { + false => get_etag(response.headers()).context(MetadataSnafu)?, + true => { + let response = response + .bytes() + .await + .context(CreateMultipartResponseBodySnafu)?; + let response: CopyPartResult = quick_xml::de::from_reader(response.reader()) + .context(InvalidMultipartResponseSnafu)?; + response.e_tag + } + }; Ok(PartId { content_id }) } + pub(crate) async fn abort_multipart(&self, location: &Path, upload_id: &str) -> Result<()> { + self.request(Method::DELETE, 
location) + .query(&[("uploadId", upload_id)]) + .with_encryption_headers() + .send() + .await?; + + Ok(()) + } + pub(crate) async fn complete_multipart( &self, location: &Path, upload_id: &str, parts: Vec, + mode: CompleteMultipartMode, ) -> Result { let parts = if parts.is_empty() { // If no parts were uploaded, upload an empty part // otherwise the completion request will fail let part = self - .put_part(location, &upload_id.to_string(), 0, PutPayload::default()) + .put_part( + location, + &upload_id.to_string(), + 0, + PutPartPayload::default(), + ) .await?; vec![part] } else { @@ -651,18 +709,27 @@ impl S3Client { let credential = self.config.get_session_credential().await?; let url = self.config.path_url(location); - let response = self + let request = self .client .request(Method::POST, url) .query(&[("uploadId", upload_id)]) .body(body) - .with_aws_sigv4(credential.authorizer(), None) + .with_aws_sigv4(credential.authorizer(), None); + + let request = match mode { + CompleteMultipartMode::Overwrite => request, + CompleteMultipartMode::Create => request.header("If-None-Match", "*"), + }; + + let response = request .retryable(&self.config.retry_config) .idempotent(true) .retry_error_body(true) .send() .await - .context(CompleteMultipartRequestSnafu)?; + .context(CompleteMultipartRequestSnafu { + path: location.as_ref(), + })?; let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; diff --git a/src/aws/mod.rs b/src/aws/mod.rs index a27ed05..b238d90 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -36,7 +36,7 @@ use reqwest::{Method, StatusCode}; use std::{sync::Arc, time::Duration}; use url::Url; -use crate::aws::client::{RequestError, S3Client}; +use crate::aws::client::{CompleteMultipartMode, PutPartPayload, RequestError, S3Client}; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::CredentialProvider; @@ -169,7 +169,10 @@ impl ObjectStore for AmazonS3 { match (opts.mode, &self.client.config.conditional_put) { (PutMode::Overwrite, _) => request.idempotent(true).do_put().await, (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), - (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { + ( + PutMode::Create, + Some(S3ConditionalPut::ETagMatch | S3ConditionalPut::ETagPutIfNotExists), + ) => { match request.header(&IF_NONE_MATCH, "*").do_put().await { // Technically If-None-Match should return NotModified but some stores, // such as R2, instead return PreconditionFailed @@ -193,6 +196,7 @@ impl ObjectStore for AmazonS3 { source: "ETag required for conditional put".to_string().into(), })?; match put { + S3ConditionalPut::ETagPutIfNotExists => Err(Error::NotImplemented), S3ConditionalPut::ETagMatch => { request.header(&IF_MATCH, etag.as_str()).do_put().await } @@ -293,6 +297,47 @@ impl ObjectStore for AmazonS3 { let (k, v, status) = match &self.client.config.copy_if_not_exists { Some(S3CopyIfNotExists::Header(k, v)) => (k, v, StatusCode::PRECONDITION_FAILED), Some(S3CopyIfNotExists::HeaderWithStatus(k, v, status)) => (k, v, *status), + Some(S3CopyIfNotExists::Multipart) => { + let upload_id = self + .client + .create_multipart(to, PutMultipartOpts::default()) + .await?; + + let res = async { + let part_id = self + .client + .put_part(to, &upload_id, 0, PutPartPayload::Copy(from)) + .await?; + match self + .client + .complete_multipart( + to, + &upload_id, + vec![part_id], + CompleteMultipartMode::Create, + ) + .await + { + Err(e @ Error::Precondition { .. 
}) => Err(Error::AlreadyExists { + path: to.to_string(), + source: Box::new(e), + }), + Ok(_) => Ok(()), + Err(e) => Err(e), + } + } + .await; + + // If the multipart upload failed, make a best effort attempt to + // clean it up. It's the caller's responsibility to add a + // lifecycle rule if guaranteed cleanup is required, as we + // cannot protect against an ill-timed process crash. + if res.is_err() { + let _ = self.client.abort_multipart(to, &upload_id).await; + } + + return res; + } Some(S3CopyIfNotExists::Dynamo(lock)) => { return lock.copy_if_not_exists(&self.client, from, to).await } @@ -340,7 +385,12 @@ impl MultipartUpload for S3MultiPartUpload { Box::pin(async move { let part = state .client - .put_part(&state.location, &state.upload_id, idx, data) + .put_part( + &state.location, + &state.upload_id, + idx, + PutPartPayload::Part(data), + ) .await?; state.parts.put(idx, part); Ok(()) @@ -352,7 +402,12 @@ impl MultipartUpload for S3MultiPartUpload { self.state .client - .complete_multipart(&self.state.location, &self.state.upload_id, parts) + .complete_multipart( + &self.state.location, + &self.state.upload_id, + parts, + CompleteMultipartMode::Overwrite, + ) .await } @@ -384,7 +439,9 @@ impl MultipartStore for AmazonS3 { part_idx: usize, data: PutPayload, ) -> Result { - self.client.put_part(path, id, part_idx, data).await + self.client + .put_part(path, id, part_idx, PutPartPayload::Part(data)) + .await } async fn complete_multipart( @@ -393,7 +450,9 @@ impl MultipartStore for AmazonS3 { id: &MultipartId, parts: Vec, ) -> Result { - self.client.complete_multipart(path, id, parts).await + self.client + .complete_multipart(path, id, parts, CompleteMultipartMode::Overwrite) + .await } async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()> { @@ -427,7 +486,6 @@ mod tests { let integration = config.build().unwrap(); let config = &integration.client.config; let test_not_exists = config.copy_if_not_exists.is_some(); - let test_conditional_put = config.conditional_put.is_some(); put_get_delete_list(&integration).await; get_opts(&integration).await; @@ -458,8 +516,9 @@ mod tests { if test_not_exists { copy_if_not_exists(&integration).await; } - if test_conditional_put { - put_opts(&integration, true).await; + if let Some(conditional_put) = &config.conditional_put { + let supports_update = !matches!(conditional_put, S3ConditionalPut::ETagPutIfNotExists); + put_opts(&integration, supports_update).await; } // run integration test with unsigned payload enabled diff --git a/src/aws/precondition.rs b/src/aws/precondition.rs index ad9e215..e505805 100644 --- a/src/aws/precondition.rs +++ b/src/aws/precondition.rs @@ -46,6 +46,21 @@ pub enum S3CopyIfNotExists { /// /// Encoded as `header-with-status:::` ignoring whitespace HeaderWithStatus(String, String, reqwest::StatusCode), + /// Native Amazon S3 supports copy if not exists through a multipart upload + /// where the upload copies an existing object and is completed only if the + /// new object does not already exist. + /// + /// WARNING: When using this mode, `copy_if_not_exists` does not copy tags + /// or attributes from the source object. + /// + /// WARNING: When using this mode, `copy_if_not_exists` makes only a best + /// effort attempt to clean up the multipart upload if the copy operation + /// fails. Consider using a lifecycle rule to automatically clean up + /// abandoned multipart uploads. See [the module + /// docs](super#multipart-uploads) for details. + /// + /// Encoded as `multipart` ignoring whitespace. 
+ Multipart, /// The name of a DynamoDB table to use for coordination /// /// Encoded as either `dynamo:` or `dynamo::` @@ -64,6 +79,7 @@ impl std::fmt::Display for S3CopyIfNotExists { Self::HeaderWithStatus(k, v, code) => { write!(f, "header-with-status: {k}: {v}: {}", code.as_u16()) } + Self::Multipart => f.write_str("multipart"), Self::Dynamo(lock) => write!(f, "dynamo: {}", lock.table_name()), } } @@ -71,6 +87,10 @@ impl std::fmt::Display for S3CopyIfNotExists { impl S3CopyIfNotExists { fn from_str(s: &str) -> Option { + if s.trim() == "multipart" { + return Some(Self::Multipart); + }; + let (variant, value) = s.split_once(':')?; match variant.trim() { "header" => { @@ -118,6 +138,17 @@ pub enum S3ConditionalPut { /// [HTTP precondition]: https://datatracker.ietf.org/doc/html/rfc9110#name-preconditions ETagMatch, + /// Like `ETagMatch`, but with support for `PutMode::Create` and not + /// `PutMode::Option`. + /// + /// This is the limited form of conditional put supported by Amazon S3 + /// as of August 2024 ([announcement]). + /// + /// Encoded as `etag-put-if-not-exists` ignoring whitespace. + /// + /// [announcement]: https://aws.amazon.com/about-aws/whats-new/2024/08/amazon-s3-conditional-writes/ + ETagPutIfNotExists, + /// The name of a DynamoDB table to use for coordination /// /// Encoded as either `dynamo:` or `dynamo::` @@ -133,6 +164,7 @@ impl std::fmt::Display for S3ConditionalPut { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ETagMatch => write!(f, "etag"), + Self::ETagPutIfNotExists => write!(f, "etag-put-if-not-exists"), Self::Dynamo(lock) => write!(f, "dynamo: {}", lock.table_name()), } } @@ -142,6 +174,7 @@ impl S3ConditionalPut { fn from_str(s: &str) -> Option { match s.trim() { "etag" => Some(Self::ETagMatch), + "etag-put-if-not-exists" => Some(Self::ETagPutIfNotExists), trimmed => match trimmed.split_once(':')? { ("dynamo", s) => Some(Self::Dynamo(DynamoCommit::from_str(s)?)), _ => None, diff --git a/src/client/s3.rs b/src/client/s3.rs index a9c4726..dba752c 100644 --- a/src/client/s3.rs +++ b/src/client/s3.rs @@ -92,6 +92,14 @@ pub(crate) struct InitiateMultipartUploadResult { pub upload_id: String, } +#[cfg(feature = "aws")] +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub(crate) struct CopyPartResult { + #[serde(rename = "ETag")] + pub e_tag: String, +} + #[derive(Debug, Serialize)] #[serde(rename_all = "PascalCase")] pub(crate) struct CompleteMultipartUpload { diff --git a/src/integration.rs b/src/integration.rs index 89b21bc..3017787 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -651,6 +651,12 @@ pub async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { assert_eq!(b.as_ref(), b"a"); if !supports_update { + let err = storage + .put_opts(&path, "c".into(), PutMode::Update(v1.clone().into()).into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::NotImplemented { .. 
}), "{err}"); + return; } From 246870d0e8057a606cf1c4560f3c7f925802cea6 Mon Sep 17 00:00:00 2001 From: Marco Herrera-Rendon Date: Fri, 8 Nov 2024 15:44:51 -0700 Subject: [PATCH 357/397] check sign_payload instead of skip_signature before computing checksum (#6698) * check sign_payload instead of skip_signature before computing checksum * Update object_store/src/aws/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fix format --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/aws/client.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index a610e63..895308f 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -372,7 +372,9 @@ impl<'a> Request<'a> { } pub(crate) fn with_payload(mut self, payload: PutPayload) -> Self { - if !self.config.skip_signature || self.config.checksum.is_some() { + if (!self.config.skip_signature && self.config.sign_payload) + || self.config.checksum.is_some() + { let mut sha256 = Context::new(&digest::SHA256); payload.iter().for_each(|x| sha256.update(x)); let payload_sha256 = sha256.finish(); From 32c2f0d67e36609b8d8c1f16f4943aba48067ef9 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Fri, 22 Nov 2024 14:55:12 +0000 Subject: [PATCH 358/397] object_store: Add support for requester pays buckets (#6768) * Add support for requester pays buckets * Add tests * fix rustdoc --- src/aws/builder.rs | 24 +++++++++ src/aws/client.rs | 4 +- src/aws/credential.rs | 114 ++++++++++++++++++++++++++++++++++++++++++ src/aws/mod.rs | 3 +- 4 files changed, 143 insertions(+), 2 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index eb79f5e..840245a 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -170,6 +170,8 @@ pub struct AmazonS3Builder { encryption_bucket_key_enabled: Option>, /// base64-encoded 256-bit customer encryption key for SSE-C. 
encryption_customer_key_base64: Option, + /// When set to true, charge requester for bucket operations + request_payer: ConfigValue, } /// Configuration keys for [`AmazonS3Builder`] @@ -330,6 +332,13 @@ pub enum AmazonS3ConfigKey { /// - `s3_express` S3Express, + /// Enable Support for S3 Requester Pays + /// + /// Supported keys: + /// - `aws_request_payer` + /// - `request_payer` + RequestPayer, + /// Client options Client(ClientConfigKey), @@ -358,6 +367,7 @@ impl AsRef for AmazonS3ConfigKey { Self::CopyIfNotExists => "aws_copy_if_not_exists", Self::ConditionalPut => "aws_conditional_put", Self::DisableTagging => "aws_disable_tagging", + Self::RequestPayer => "aws_request_payer", Self::Client(opt) => opt.as_ref(), Self::Encryption(opt) => opt.as_ref(), } @@ -389,6 +399,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_copy_if_not_exists" | "copy_if_not_exists" => Ok(Self::CopyIfNotExists), "aws_conditional_put" | "conditional_put" => Ok(Self::ConditionalPut), "aws_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), + "aws_request_payer" | "request_payer" => Ok(Self::RequestPayer), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), "aws_server_side_encryption" => Ok(Self::Encryption( @@ -510,6 +521,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ConditionalPut => { self.conditional_put = Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::RequestPayer => { + self.request_payer = ConfigValue::Deferred(value.into()) + } AmazonS3ConfigKey::Encryption(key) => match key { S3EncryptionConfigKey::ServerSideEncryption => { self.encryption_type = Some(ConfigValue::Deferred(value.into())) @@ -567,6 +581,7 @@ impl AmazonS3Builder { self.conditional_put.as_ref().map(ToString::to_string) } AmazonS3ConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), + AmazonS3ConfigKey::RequestPayer => Some(self.request_payer.to_string()), AmazonS3ConfigKey::Encryption(key) => match key { S3EncryptionConfigKey::ServerSideEncryption => { self.encryption_type.as_ref().map(ToString::to_string) @@ -845,6 +860,14 @@ impl AmazonS3Builder { self } + /// Set whether to charge requester for bucket operations. + /// + /// + pub fn with_request_payer(mut self, enabled: bool) -> Self { + self.request_payer = ConfigValue::Parsed(enabled); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. 
pub fn build(mut self) -> Result { @@ -996,6 +1019,7 @@ impl AmazonS3Builder { copy_if_not_exists, conditional_put: put_precondition, encryption_headers, + request_payer: self.request_payer.get()?, }; let client = Arc::new(S3Client::new(config)?); diff --git a/src/aws/client.rs b/src/aws/client.rs index 895308f..b19e0e2 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -202,6 +202,7 @@ pub(crate) struct S3Config { pub checksum: Option, pub copy_if_not_exists: Option, pub conditional_put: Option, + pub request_payer: bool, pub(super) encryption_headers: S3EncryptionHeaders, } @@ -249,7 +250,8 @@ impl<'a> SessionCredential<'a> { fn authorizer(&self) -> Option> { let mut authorizer = AwsAuthorizer::new(self.credential.as_deref()?, "s3", &self.config.region) - .with_sign_payload(self.config.sign_payload); + .with_sign_payload(self.config.sign_payload) + .with_request_payer(self.config.request_payer); if self.session_token { let token = HeaderName::from_static("x-amz-s3session-token"); diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 33972c6..ee2f8e2 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -101,11 +101,14 @@ pub struct AwsAuthorizer<'a> { region: &'a str, token_header: Option, sign_payload: bool, + request_payer: bool, } static DATE_HEADER: HeaderName = HeaderName::from_static("x-amz-date"); static HASH_HEADER: HeaderName = HeaderName::from_static("x-amz-content-sha256"); static TOKEN_HEADER: HeaderName = HeaderName::from_static("x-amz-security-token"); +static REQUEST_PAYER_HEADER: HeaderName = HeaderName::from_static("x-amz-request-payer"); +static REQUEST_PAYER_HEADER_VALUE: HeaderValue = HeaderValue::from_static("requester"); const ALGORITHM: &str = "AWS4-HMAC-SHA256"; impl<'a> AwsAuthorizer<'a> { @@ -118,6 +121,7 @@ impl<'a> AwsAuthorizer<'a> { date: None, sign_payload: true, token_header: None, + request_payer: false, } } @@ -134,6 +138,14 @@ impl<'a> AwsAuthorizer<'a> { self } + /// Set whether to include requester pays headers + /// + /// + pub fn with_request_payer(mut self, request_payer: bool) -> Self { + self.request_payer = request_payer; + self + } + /// Authorize `request` with an optional pre-calculated SHA256 digest by attaching /// the relevant [AWS SigV4] headers /// @@ -180,6 +192,15 @@ impl<'a> AwsAuthorizer<'a> { let header_digest = HeaderValue::from_str(&digest).unwrap(); request.headers_mut().insert(&HASH_HEADER, header_digest); + if self.request_payer { + // For DELETE, GET, HEAD, POST, and PUT requests, include x-amz-request-payer : + // requester in the header + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/ObjectsinRequesterPaysBuckets.html + request + .headers_mut() + .insert(&REQUEST_PAYER_HEADER, REQUEST_PAYER_HEADER_VALUE.clone()); + } + let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); let scope = self.scope(date); @@ -226,6 +247,13 @@ impl<'a> AwsAuthorizer<'a> { .append_pair("X-Amz-Expires", &expires_in.as_secs().to_string()) .append_pair("X-Amz-SignedHeaders", "host"); + if self.request_payer { + // For signed URLs, include x-amz-request-payer=requester in the request + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/ObjectsinRequesterPaysBuckets.html + url.query_pairs_mut() + .append_pair("x-amz-request-payer", "requester"); + } + // For S3, you must include the X-Amz-Security-Token query parameter in the URL if // using credentials sourced from the STS service. 
if let Some(ref token) = self.credential.token { @@ -763,12 +791,53 @@ mod tests { region: "us-east-1", sign_payload: true, token_header: None, + request_payer: false, }; signer.authorize(&mut request, None); assert_eq!(request.headers().get(&AUTHORIZATION).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } + #[test] + fn test_sign_with_signed_payload_request_payer() { + let client = Client::new(); + + // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + // method = 'GET' + // service = 'ec2' + // host = 'ec2.amazonaws.com' + // region = 'us-east-1' + // endpoint = 'https://ec2.amazonaws.com' + // request_parameters = '' + let date = DateTime::parse_from_rfc3339("2022-08-06T18:01:34Z") + .unwrap() + .with_timezone(&Utc); + + let mut request = client + .request(Method::GET, "https://ec2.amazon.com/") + .build() + .unwrap(); + + let signer = AwsAuthorizer { + date: Some(date), + credential: &credential, + service: "ec2", + region: "us-east-1", + sign_payload: true, + token_header: None, + request_payer: true, + }; + + signer.authorize(&mut request, None); + assert_eq!(request.headers().get(&AUTHORIZATION).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-request-payer, Signature=7030625a9e9b57ed2a40e63d749f4a4b7714b6e15004cab026152f870dd8565d") + } + #[test] fn test_sign_with_unsigned_payload() { let client = Client::new(); @@ -802,6 +871,7 @@ mod tests { region: "us-east-1", token_header: None, sign_payload: false, + request_payer: false, }; authorizer.authorize(&mut request, None); @@ -828,6 +898,7 @@ mod tests { region: "us-east-1", token_header: None, sign_payload: false, + request_payer: false, }; let mut url = Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); @@ -848,6 +919,48 @@ mod tests { ); } + #[test] + fn signed_get_url_request_payer() { + // Values from https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + let date = DateTime::parse_from_rfc3339("2013-05-24T00:00:00Z") + .unwrap() + .with_timezone(&Utc); + + let authorizer = AwsAuthorizer { + date: Some(date), + credential: &credential, + service: "s3", + region: "us-east-1", + token_header: None, + sign_payload: false, + request_payer: true, + }; + + let mut url = Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); + authorizer.sign(Method::GET, &mut url, Duration::from_secs(86400)); + + assert_eq!( + url, + Url::parse( + "https://examplebucket.s3.amazonaws.com/test.txt?\ + X-Amz-Algorithm=AWS4-HMAC-SHA256&\ + X-Amz-Credential=AKIAIOSFODNN7EXAMPLE%2F20130524%2Fus-east-1%2Fs3%2Faws4_request&\ + X-Amz-Date=20130524T000000Z&\ + X-Amz-Expires=86400&\ + X-Amz-SignedHeaders=host&\ + x-amz-request-payer=requester&\ + X-Amz-Signature=9ad7c781cc30121f199b47d35ed3528473e4375b63c5d91cd87c927803e4e00a" + ) + .unwrap() + ); + } + #[test] fn test_sign_port() { let client = Client::new(); @@ -880,6 +993,7 @@ 
mod tests { region: "us-east-1", token_header: None, sign_payload: true, + request_payer: false, }; authorizer.authorize(&mut request, None); diff --git a/src/aws/mod.rs b/src/aws/mod.rs index b238d90..81511ba 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -136,7 +136,8 @@ impl Signer for AmazonS3 { /// ``` async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result { let credential = self.credentials().get_credential().await?; - let authorizer = AwsAuthorizer::new(&credential, "s3", &self.client.config.region); + let authorizer = AwsAuthorizer::new(&credential, "s3", &self.client.config.region) + .with_request_payer(self.client.config.request_payer); let path_url = self.path_url(path); let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { From 567bd38112b10f94d676779df9949e93bc998fa7 Mon Sep 17 00:00:00 2001 From: Zachary DeLuca Date: Fri, 22 Nov 2024 14:10:23 -0500 Subject: [PATCH 359/397] Add AuthorityHost to AzureConfigKey (#6773) --- src/azure/builder.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 1c4589b..08c9a23 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -240,6 +240,14 @@ pub enum AzureConfigKey { /// - `authority_id` AuthorityId, + /// Authority host used in oauth flows + /// + /// Supported keys: + /// - `azure_storage_authority_host` + /// - `azure_authority_host` + /// - `authority_host` + AuthorityHost, + /// Shared access signature. /// /// The signature is expected to be percent-encoded, much like they are provided @@ -383,6 +391,7 @@ impl AsRef for AzureConfigKey { Self::ClientId => "azure_storage_client_id", Self::ClientSecret => "azure_storage_client_secret", Self::AuthorityId => "azure_storage_tenant_id", + Self::AuthorityHost => "azure_storage_authority_host", Self::SasKey => "azure_storage_sas_key", Self::Token => "azure_storage_token", Self::UseEmulator => "azure_storage_use_emulator", @@ -427,6 +436,9 @@ impl FromStr for AzureConfigKey { | "azure_authority_id" | "tenant_id" | "authority_id" => Ok(Self::AuthorityId), + "azure_storage_authority_host" | "azure_authority_host" | "authority_host" => { + Ok(Self::AuthorityHost) + } "azure_storage_sas_key" | "azure_storage_sas_token" | "sas_key" | "sas_token" => { Ok(Self::SasKey) } @@ -556,6 +568,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::ClientId => self.client_id = Some(value.into()), AzureConfigKey::ClientSecret => self.client_secret = Some(value.into()), AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), + AzureConfigKey::AuthorityHost => self.authority_host = Some(value.into()), AzureConfigKey::SasKey => self.sas_key = Some(value.into()), AzureConfigKey::Token => self.bearer_token = Some(value.into()), AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), @@ -602,6 +615,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::ClientId => self.client_id.clone(), AzureConfigKey::ClientSecret => self.client_secret.clone(), AzureConfigKey::AuthorityId => self.tenant_id.clone(), + AzureConfigKey::AuthorityHost => self.authority_host.clone(), AzureConfigKey::SasKey => self.sas_key.clone(), AzureConfigKey::Token => self.bearer_token.clone(), AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), From 750e79a16d77990646524dd7a64d980dd7f9d527 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Sun, 24 Nov 2024 22:55:39 +0100 Subject: [PATCH 360/397] Add version to deprecation messages (#6782) Version is inferred from first release tag 
containing the commit that added the deprecation. --- src/memory.rs | 2 +- src/prefix.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/memory.rs b/src/memory.rs index b458bdd..4584ab7 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -469,7 +469,7 @@ impl InMemory { } /// Creates a clone of the store - #[deprecated(note = "Use fork() instead")] + #[deprecated(since = "44.0.0", note = "Use fork() instead")] pub async fn clone(&self) -> Self { self.fork() } diff --git a/src/prefix.rs b/src/prefix.rs index 9b10fea..8e52d1f 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -27,7 +27,7 @@ use crate::{ }; #[doc(hidden)] -#[deprecated(note = "Use PrefixStore")] +#[deprecated(since = "36.0.0", note = "Use PrefixStore")] pub type PrefixObjectStore = PrefixStore; /// Store wrapper that applies a constant prefix to all paths handled by the store. From d54fe5ab0d8ba482e240e69ef256d02f430c89d3 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Mon, 25 Nov 2024 14:57:40 +0100 Subject: [PATCH 361/397] Remove APIs deprecated on or before 49.0.0 (#6786) 49 release was Nov 9, 2023. Remove the APIs that are deprecated since then, or earlier than that. Few such deprecated APIs remain in the code, they are still in use so the code needs update. --- src/memory.rs | 6 ------ src/prefix.rs | 4 ---- 2 files changed, 10 deletions(-) diff --git a/src/memory.rs b/src/memory.rs index 4584ab7..a467e3b 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -468,12 +468,6 @@ impl InMemory { Self { storage } } - /// Creates a clone of the store - #[deprecated(since = "44.0.0", note = "Use fork() instead")] - pub async fn clone(&self) -> Self { - self.fork() - } - async fn entry(&self, location: &Path) -> Result { let storage = self.storage.read(); let value = storage diff --git a/src/prefix.rs b/src/prefix.rs index 8e52d1f..227887d 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -26,10 +26,6 @@ use crate::{ PutOptions, PutPayload, PutResult, Result, }; -#[doc(hidden)] -#[deprecated(since = "36.0.0", note = "Use PrefixStore")] -pub type PrefixObjectStore = PrefixStore; - /// Store wrapper that applies a constant prefix to all paths handled by the store. 
#[derive(Debug, Clone)] pub struct PrefixStore { From 1fea3c3cef6d1c2c3c056aff925949832e1d821d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Guedes?= Date: Mon, 25 Nov 2024 11:29:58 -0300 Subject: [PATCH 362/397] Implement bulk_delete_request for Azure (#5681) * Implement bulk_delete_request for Azure * Fix lint and add Azurite bug workaround * Special 404 error case * Clippy fix * Make number of expected headers more conservative and better document invariants * Use multer for multipart parsing * Fix clippy * Fix clippy #2 * Reuse part response buffer * Make multer conditional to azure feature * One more HeaderValue::from_static * Add tests for bulk delete request building and response parsing * Switch back to manual parsing to avoid multer dependency, other PR suggestions * Fixes lint --- Cargo.toml | 5 +- src/azure/client.rs | 486 +++++++++++++++++++++++++++++++++++++++++++- src/azure/mod.rs | 22 +- 3 files changed, 509 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 86d1392..536874f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,13 +55,14 @@ ring = { version = "0.17", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } tokio = { version = "1.29.0", features = ["sync", "macros", "rt", "time", "io-util"] } md-5 = { version = "0.10.6", default-features = false, optional = true } +httparse = { version = "1.8.0", default-features = false, features = ["std"], optional = true } [target.'cfg(target_family="unix")'.dev-dependencies] nix = { version = "0.29.0", features = ["fs"] } [features] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] -azure = ["cloud"] +azure = ["cloud", "httparse"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud", "md-5"] http = ["cloud"] @@ -75,6 +76,8 @@ hyper-util = "0.1" http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" +regex = "1.11.1" +http = "1.1.0" [[test]] name = "get_range_file" diff --git a/src/azure/client.rs b/src/azure/client.rs index e78f8db..76dedd7 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -31,13 +31,13 @@ use crate::{ PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, RetryConfig, TagSet, }; use async_trait::async_trait; -use base64::prelude::BASE64_STANDARD; +use base64::prelude::{BASE64_STANDARD, BASE64_STANDARD_NO_PAD}; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use hyper::http::HeaderName; use reqwest::{ - header::{HeaderValue, CONTENT_LENGTH, IF_MATCH, IF_NONE_MATCH}, + header::{HeaderMap, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE, IF_MATCH, IF_NONE_MATCH}, Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; @@ -79,6 +79,34 @@ pub(crate) enum Error { path: String, }, + #[snafu(display("Error performing bulk delete request: {}", source))] + BulkDeleteRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error receiving bulk delete request body: {}", source))] + BulkDeleteRequestBody { source: reqwest::Error }, + + #[snafu(display( + "Bulk delete request failed due to invalid input: {} (code: {})", + reason, + code + ))] + BulkDeleteRequestInvalidInput { code: String, reason: String }, + + #[snafu(display("Got invalid bulk delete response: {}", reason))] + InvalidBulkDeleteResponse { reason: String }, + + #[snafu(display( + "Bulk delete request failed for key {}: {} (code: {})", + path, + reason, + 
code + ))] + DeleteFailed { + path: String, + code: String, + reason: String, + }, + #[snafu(display("Error performing list request: {}", source))] ListRequest { source: crate::client::retry::Error }, @@ -247,6 +275,223 @@ impl<'a> PutRequest<'a> { } } +#[inline] +fn extend(dst: &mut Vec, data: &[u8]) { + dst.extend_from_slice(data); +} + +// Write header names as title case. The header name is assumed to be ASCII. +// We need it because Azure is not always treating headers as case insensitive. +fn title_case(dst: &mut Vec, name: &[u8]) { + dst.reserve(name.len()); + + // Ensure first character is uppercased + let mut prev = b'-'; + for &(mut c) in name { + if prev == b'-' { + c.make_ascii_uppercase(); + } + dst.push(c); + prev = c; + } +} + +fn write_headers(headers: &HeaderMap, dst: &mut Vec) { + for (name, value) in headers { + // We need special case handling here otherwise Azure returns 400 + // due to `Content-Id` instead of `Content-ID` + if name == "content-id" { + extend(dst, b"Content-ID"); + } else { + title_case(dst, name.as_str().as_bytes()); + } + extend(dst, b": "); + extend(dst, value.as_bytes()); + extend(dst, b"\r\n"); + } +} + +// https://docs.oasis-open.org/odata/odata/v4.0/errata02/os/complete/part1-protocol/odata-v4.0-errata02-os-part1-protocol-complete.html#_Toc406398359 +fn serialize_part_delete_request( + dst: &mut Vec, + boundary: &str, + idx: usize, + request: reqwest::Request, + relative_url: String, +) { + // Encode start marker for part + extend(dst, b"--"); + extend(dst, boundary.as_bytes()); + extend(dst, b"\r\n"); + + // Encode part headers + let mut part_headers = HeaderMap::new(); + part_headers.insert(CONTENT_TYPE, HeaderValue::from_static("application/http")); + part_headers.insert( + "Content-Transfer-Encoding", + HeaderValue::from_static("binary"), + ); + // Azure returns 400 if we send `Content-Id` instead of `Content-ID` + part_headers.insert("Content-ID", HeaderValue::from(idx)); + write_headers(&part_headers, dst); + extend(dst, b"\r\n"); + + // Encode the subrequest request-line + extend(dst, b"DELETE "); + extend(dst, format!("/{} ", relative_url).as_bytes()); + extend(dst, b"HTTP/1.1"); + extend(dst, b"\r\n"); + + // Encode subrequest headers + write_headers(request.headers(), dst); + extend(dst, b"\r\n"); + extend(dst, b"\r\n"); +} + +fn parse_multipart_response_boundary(response: &Response) -> Result { + let invalid_response = |msg: &str| Error::InvalidBulkDeleteResponse { + reason: msg.to_string(), + }; + + let content_type = response + .headers() + .get(CONTENT_TYPE) + .ok_or_else(|| invalid_response("missing Content-Type"))?; + + let boundary = content_type + .as_ref() + .strip_prefix(b"multipart/mixed; boundary=") + .ok_or_else(|| invalid_response("invalid Content-Type value"))? 
+ .to_vec(); + + let boundary = + String::from_utf8(boundary).map_err(|_| invalid_response("invalid multipart boundary"))?; + + Ok(boundary) +} + +fn invalid_response(msg: &str) -> Error { + Error::InvalidBulkDeleteResponse { + reason: msg.to_string(), + } +} + +#[derive(Debug)] +struct MultipartField { + headers: HeaderMap, + content: Bytes, +} + +fn parse_multipart_body_fields(body: Bytes, boundary: &[u8]) -> Result> { + let start_marker = [b"--", boundary, b"\r\n"].concat(); + let next_marker = &start_marker[..start_marker.len() - 2]; + let end_marker = [b"--", boundary, b"--\r\n"].concat(); + + // There should be at most 256 responses per batch + let mut fields = Vec::with_capacity(256); + let mut remaining: &[u8] = body.as_ref(); + loop { + remaining = remaining + .strip_prefix(start_marker.as_slice()) + .ok_or_else(|| invalid_response("missing start marker for field"))?; + + // The documentation only mentions two headers for fields, we leave some extra margin + let mut scratch = [httparse::EMPTY_HEADER; 10]; + let mut headers = HeaderMap::new(); + match httparse::parse_headers(remaining, &mut scratch) { + Ok(httparse::Status::Complete((pos, headers_slice))) => { + remaining = &remaining[pos..]; + for header in headers_slice { + headers.insert( + HeaderName::from_bytes(header.name.as_bytes()).expect("valid"), + HeaderValue::from_bytes(header.value).expect("valid"), + ); + } + } + _ => return Err(invalid_response("unable to parse field headers").into()), + }; + + let next_pos = remaining + .windows(next_marker.len()) + .position(|window| window == next_marker) + .ok_or_else(|| invalid_response("early EOF while seeking to next boundary"))?; + + fields.push(MultipartField { + headers, + content: body.slice_ref(&remaining[..next_pos]), + }); + + remaining = &remaining[next_pos..]; + + // Support missing final CRLF + if remaining == end_marker || remaining == &end_marker[..end_marker.len() - 2] { + break; + } + } + Ok(fields) +} + +async fn parse_blob_batch_delete_body( + batch_body: Bytes, + boundary: String, + paths: &[Path], +) -> Result>> { + let mut results: Vec> = paths.iter().cloned().map(Ok).collect(); + + for field in parse_multipart_body_fields(batch_body, boundary.as_bytes())? 
{ + let id = field + .headers + .get("content-id") + .and_then(|v| std::str::from_utf8(v.as_bytes()).ok()) + .and_then(|v| v.parse::().ok()); + + // Parse part response headers + // Documentation mentions 5 headers and states that other standard HTTP headers + // may be provided, in order to not incurr in more complexity to support an arbitrary + // amount of headers we chose a conservative amount and error otherwise + // https://learn.microsoft.com/en-us/rest/api/storageservices/delete-blob?tabs=microsoft-entra-id#response-headers + let mut headers = [httparse::EMPTY_HEADER; 48]; + let mut part_response = httparse::Response::new(&mut headers); + match part_response.parse(&field.content) { + Ok(httparse::Status::Complete(_)) => {} + _ => return Err(invalid_response("unable to parse response").into()), + }; + + match (id, part_response.code) { + (Some(_id), Some(code)) if (200..300).contains(&code) => {} + (Some(id), Some(404)) => { + results[id] = Err(crate::Error::NotFound { + path: paths[id].as_ref().to_string(), + source: Error::DeleteFailed { + path: paths[id].as_ref().to_string(), + code: 404.to_string(), + reason: part_response.reason.unwrap_or_default().to_string(), + } + .into(), + }); + } + (Some(id), Some(code)) => { + results[id] = Err(Error::DeleteFailed { + path: paths[id].as_ref().to_string(), + code: code.to_string(), + reason: part_response.reason.unwrap_or_default().to_string(), + } + .into()); + } + (None, Some(code)) => { + return Err(Error::BulkDeleteRequestInvalidInput { + code: code.to_string(), + reason: part_response.reason.unwrap_or_default().to_string(), + } + .into()) + } + _ => return Err(invalid_response("missing part response status code").into()), + } + } + + Ok(results) +} + #[derive(Debug)] pub(crate) struct AzureClient { config: AzureConfig, @@ -380,6 +625,86 @@ impl AzureClient { Ok(()) } + fn build_bulk_delete_body( + &self, + boundary: &str, + paths: &[Path], + credential: &Option>, + ) -> Vec { + let mut body_bytes = Vec::with_capacity(paths.len() * 2048); + + for (idx, path) in paths.iter().enumerate() { + let url = self.config.path_url(path); + + // Build subrequest with proper authorization + let request = self + .client + .request(Method::DELETE, url) + .header(CONTENT_LENGTH, HeaderValue::from(0)) + // Each subrequest must be authorized individually [1] and we use + // the CredentialExt for this. 
+ // [1]: https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch?tabs=microsoft-entra-id#request-body + .with_azure_authorization(credential, &self.config.account) + .build() + .unwrap(); + + // Url for part requests must be relative and without base + let relative_url = self.config.service.make_relative(request.url()).unwrap(); + + serialize_part_delete_request(&mut body_bytes, boundary, idx, request, relative_url) + } + + // Encode end marker + extend(&mut body_bytes, b"--"); + extend(&mut body_bytes, boundary.as_bytes()); + extend(&mut body_bytes, b"--"); + extend(&mut body_bytes, b"\r\n"); + body_bytes + } + + pub(crate) async fn bulk_delete_request(&self, paths: Vec) -> Result>> { + if paths.is_empty() { + return Ok(Vec::new()); + } + + let credential = self.get_credential().await?; + + // https://www.ietf.org/rfc/rfc2046 + let random_bytes = rand::random::<[u8; 16]>(); // 128 bits + let boundary = format!("batch_{}", BASE64_STANDARD_NO_PAD.encode(random_bytes)); + + let body_bytes = self.build_bulk_delete_body(&boundary, &paths, &credential); + + // Send multipart request + let url = self.config.path_url(&Path::from("/")); + let batch_response = self + .client + .request(Method::POST, url) + .query(&[("restype", "container"), ("comp", "batch")]) + .header( + CONTENT_TYPE, + HeaderValue::from_str(format!("multipart/mixed; boundary={}", boundary).as_str()) + .unwrap(), + ) + .header(CONTENT_LENGTH, HeaderValue::from(body_bytes.len())) + .body(body_bytes) + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(BulkDeleteRequestSnafu {})?; + + let boundary = parse_multipart_response_boundary(&batch_response)?; + + let batch_body = batch_response + .bytes() + .await + .context(BulkDeleteRequestBodySnafu {})?; + + let results = parse_blob_batch_delete_body(batch_body, boundary, &paths).await?; + + Ok(results) + } + /// Make an Azure Copy request pub(crate) async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let credential = self.get_credential().await?; @@ -814,8 +1139,10 @@ pub(crate) struct UserDelegationKey { #[cfg(test)] mod tests { use bytes::Bytes; + use regex::bytes::Regex; use super::*; + use crate::StaticCredentialProvider; #[test] fn deserde_azure() { @@ -1005,4 +1332,159 @@ mod tests { let _delegated_key_response_internal: UserDelegationKey = quick_xml::de::from_str(S).unwrap(); } + + #[tokio::test] + async fn test_build_bulk_delete_body() { + let credential_provider = Arc::new(StaticCredentialProvider::new( + AzureCredential::BearerToken("static-token".to_string()), + )); + + let config = AzureConfig { + account: "testaccount".to_string(), + container: "testcontainer".to_string(), + credentials: credential_provider, + service: "http://example.com".try_into().unwrap(), + retry_config: Default::default(), + is_emulator: false, + skip_signature: false, + disable_tagging: false, + client_options: Default::default(), + }; + + let client = AzureClient::new(config).unwrap(); + + let credential = client.get_credential().await.unwrap(); + let paths = &[Path::from("a"), Path::from("b"), Path::from("c")]; + + let boundary = "batch_statictestboundary".to_string(); + + let body_bytes = client.build_bulk_delete_body(&boundary, paths, &credential); + + // Replace Date header value with a static date + let re = Regex::new("Date:[^\r]+").unwrap(); + let body_bytes = re + .replace_all(&body_bytes, b"Date: Tue, 05 Nov 2024 15:01:15 GMT") + .to_vec(); + + let expected_body = 
b"--batch_statictestboundary\r +Content-Type: application/http\r +Content-Transfer-Encoding: binary\r +Content-ID: 0\r +\r +DELETE /testcontainer/a HTTP/1.1\r +Content-Length: 0\r +Date: Tue, 05 Nov 2024 15:01:15 GMT\r +X-Ms-Version: 2023-11-03\r +Authorization: Bearer static-token\r +\r +\r +--batch_statictestboundary\r +Content-Type: application/http\r +Content-Transfer-Encoding: binary\r +Content-ID: 1\r +\r +DELETE /testcontainer/b HTTP/1.1\r +Content-Length: 0\r +Date: Tue, 05 Nov 2024 15:01:15 GMT\r +X-Ms-Version: 2023-11-03\r +Authorization: Bearer static-token\r +\r +\r +--batch_statictestboundary\r +Content-Type: application/http\r +Content-Transfer-Encoding: binary\r +Content-ID: 2\r +\r +DELETE /testcontainer/c HTTP/1.1\r +Content-Length: 0\r +Date: Tue, 05 Nov 2024 15:01:15 GMT\r +X-Ms-Version: 2023-11-03\r +Authorization: Bearer static-token\r +\r +\r +--batch_statictestboundary--\r\n" + .to_vec(); + + assert_eq!(expected_body, body_bytes); + } + + #[tokio::test] + async fn test_parse_blob_batch_delete_body() { + let response_body = b"--batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed\r +Content-Type: application/http\r +Content-ID: 0\r +\r +HTTP/1.1 202 Accepted\r +x-ms-delete-type-permanent: true\r +x-ms-request-id: 778fdc83-801e-0000-62ff-0334671e284f\r +x-ms-version: 2018-11-09\r +\r +--batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed\r +Content-Type: application/http\r +Content-ID: 1\r +\r +HTTP/1.1 202 Accepted\r +x-ms-delete-type-permanent: true\r +x-ms-request-id: 778fdc83-801e-0000-62ff-0334671e2851\r +x-ms-version: 2018-11-09\r +\r +--batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed\r +Content-Type: application/http\r +Content-ID: 2\r +\r +HTTP/1.1 404 The specified blob does not exist.\r +x-ms-error-code: BlobNotFound\r +x-ms-request-id: 778fdc83-801e-0000-62ff-0334671e2852\r +x-ms-version: 2018-11-09\r +Content-Length: 216\r +Content-Type: application/xml\r +\r + +BlobNotFoundThe specified blob does not exist. +RequestId:778fdc83-801e-0000-62ff-0334671e2852 +Time:2018-06-14T16:46:54.6040685Z\r +--batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed--\r\n"; + + let response: reqwest::Response = http::Response::builder() + .status(202) + .header("Transfer-Encoding", "chunked") + .header( + "Content-Type", + "multipart/mixed; boundary=batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed", + ) + .header("x-ms-request-id", "778fdc83-801e-0000-62ff-033467000000") + .header("x-ms-version", "2018-11-09") + .body(Bytes::from(response_body.as_slice())) + .unwrap() + .into(); + + let boundary = parse_multipart_response_boundary(&response).unwrap(); + let body = response.bytes().await.unwrap(); + + let paths = &[Path::from("a"), Path::from("b"), Path::from("c")]; + + let results = parse_blob_batch_delete_body(body, boundary, paths) + .await + .unwrap(); + + assert!(results[0].is_ok()); + assert_eq!(&paths[0], results[0].as_ref().unwrap()); + + assert!(results[1].is_ok()); + assert_eq!(&paths[1], results[1].as_ref().unwrap()); + + assert!(results[2].is_err()); + let err = results[2].as_ref().unwrap_err(); + let crate::Error::NotFound { source, .. 
} = err else { + unreachable!("must be not found") + }; + let Some(Error::DeleteFailed { path, code, reason }) = source.downcast_ref::() + else { + unreachable!("must be client error") + }; + + assert_eq!(paths[2].as_ref(), path); + assert_eq!("404", code); + assert_eq!("The specified blob does not exist.", reason); + } } diff --git a/src/azure/mod.rs b/src/azure/mod.rs index f89a184..177bffb 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -30,7 +30,7 @@ use crate::{ PutMultipartOpts, PutOptions, PutPayload, PutResult, Result, UploadPart, }; use async_trait::async_trait; -use futures::stream::BoxStream; +use futures::stream::{BoxStream, StreamExt, TryStreamExt}; use reqwest::Method; use std::fmt::Debug; use std::sync::Arc; @@ -119,6 +119,26 @@ impl ObjectStore for MicrosoftAzure { self.client.delete_request(location, &()).await } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + locations + .try_chunks(256) + .map(move |locations| async { + // Early return the error. We ignore the paths that have already been + // collected into the chunk. + let locations = locations.map_err(|e| e.1)?; + self.client + .bulk_delete_request(locations) + .await + .map(futures::stream::iter) + }) + .buffered(20) + .try_flatten() + .boxed() + } + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { self.client.list(prefix) } From d39d865cad680bc665c71158836ea101d3e62867 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Tue, 26 Nov 2024 01:11:21 +0530 Subject: [PATCH 363/397] fix clippy (#6791) --- src/aws/client.rs | 2 +- src/aws/dynamo.rs | 2 +- src/local.rs | 8 ++++++-- src/path/parts.rs | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index b19e0e2..51c9177 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -246,7 +246,7 @@ struct SessionCredential<'a> { config: &'a S3Config, } -impl<'a> SessionCredential<'a> { +impl SessionCredential<'_> { fn authorizer(&self) -> Option> { let mut authorizer = AwsAuthorizer::new(self.credential.as_deref()?, "s3", &self.config.region) diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index ece3b8a..6283e76 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -471,7 +471,7 @@ enum ReturnValues { /// This provides cheap, ordered serialization of maps struct Map<'a, K, V>(&'a [(K, V)]); -impl<'a, K: Serialize, V: Serialize> Serialize for Map<'a, K, V> { +impl Serialize for Map<'_, K, V> { fn serialize(&self, serializer: S) -> Result where S: Serializer, diff --git a/src/local.rs b/src/local.rs index 11324b1..78fce9c 100644 --- a/src/local.rs +++ b/src/local.rs @@ -1004,7 +1004,7 @@ fn get_inode(metadata: &Metadata) -> u64 { #[cfg(not(unix))] /// On platforms where an inode isn't available, fallback to just relying on size and mtime -fn get_inode(metadata: &Metadata) -> u64 { +fn get_inode(_metadata: &Metadata) -> u64 { 0 } @@ -1060,7 +1060,10 @@ mod tests { use std::fs; use futures::TryStreamExt; - use tempfile::{NamedTempFile, TempDir}; + use tempfile::TempDir; + + #[cfg(target_family = "unix")] + use tempfile::NamedTempFile; use crate::integration::*; @@ -1248,6 +1251,7 @@ mod tests { fs.list_with_delimiter(None).await.unwrap(); } + #[cfg(target_family = "unix")] async fn check_list(integration: &LocalFileSystem, prefix: Option<&Path>, expected: &[&str]) { let result: Vec<_> = integration.list(prefix).try_collect().await.unwrap(); diff --git a/src/path/parts.rs b/src/path/parts.rs index df7097c..de2e1a7 100644 --- a/src/path/parts.rs 
+++ b/src/path/parts.rs @@ -126,7 +126,7 @@ impl From for PathPart<'static> { } } -impl<'a> AsRef for PathPart<'a> { +impl AsRef for PathPart<'_> { fn as_ref(&self) -> &str { self.raw.as_ref() } From 4963d942a178bbd148c63d24d7769401f6dfa302 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 26 Nov 2024 04:47:49 -0500 Subject: [PATCH 364/397] Update references from `master` to `main` (#6795) * Update references from `master` to `main` * more * Update parquet/examples/async_read_parquet.rs * Update object_store/dev/release/README.md Co-authored-by: Matthijs Brobbel --------- Co-authored-by: Matthijs Brobbel --- Cargo.toml | 2 +- dev/release/README.md | 16 ++++++++-------- dev/release/create-tarball.sh | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 536874f..a047336 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ license = "MIT/Apache-2.0" readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] -repository = "https://github.com/apache/arrow-rs/tree/master/object_store" +repository = "https://github.com/apache/arrow-rs/tree/main/object_store" rust-version = "1.64.0" [package.metadata.docs.rs] diff --git a/dev/release/README.md b/dev/release/README.md index 4077dca..912ff4c 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -27,7 +27,7 @@ This file documents the release process for the `object_store` crate. At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule. As we are still in an early phase, we use the 0.x version scheme. If any code has -been merged to master that has a breaking API change, as defined in [Rust RFC 1105] +been merged to main that has a breaking API change, as defined in [Rust RFC 1105] the minor version number is incremented changed (e.g. `0.3.0` to `0.4.0`). Otherwise the patch version is incremented (e.g. `0.3.0` to `0.3.1`). @@ -45,14 +45,14 @@ crates.io, the Rust ecosystem's package manager. We create a `CHANGELOG.md` so our users know what has been changed between releases. The CHANGELOG is created automatically using -[update_change_log.sh](https://github.com/apache/arrow-rs/blob/master/object_store/dev/release/update_change_log.sh) +[update_change_log.sh](https://github.com/apache/arrow-rs/blob/main/object_store/dev/release/update_change_log.sh) This script creates a changelog using github issues and the labels associated with them. ## Prepare CHANGELOG and version: -Now prepare a PR to update `CHANGELOG.md` and versions on `master` to reflect the planned release. +Now prepare a PR to update `CHANGELOG.md` and versions on `main` to reflect the planned release. Note this process is done in the `object_store` directory. See [#6227] for an example @@ -62,7 +62,7 @@ Note this process is done in the `object_store` directory. See [#6227] for an e # NOTE: Run commands in object_store sub directory (not main repo checkout) # cd object_store -git checkout master +git checkout main git pull git checkout -b @@ -82,7 +82,7 @@ export CHANGELOG_GITHUB_TOKEN= # Commit changes git commit -a -m 'Create changelog' -# push changes to fork and create a PR to master +# push changes to fork and create a PR to main git push ``` @@ -90,7 +90,7 @@ Note that when reviewing the change log, rather than editing the `CHANGELOG.md`, it is preferred to update the issues and their labels (e.g. 
add `invalid` label to exclude them from release notes) -Merge this PR to `master` prior to the next step. +Merge this PR to `main` prior to the next step. ## Prepare release candidate tarball @@ -109,7 +109,7 @@ Create and push the tag thusly: ```shell git fetch apache -git tag apache/master +git tag apache/main # push tag to apache git push apache ``` @@ -170,7 +170,7 @@ The vote will be open for at least 72 hours. [1]: https://github.com/apache/arrow-rs/tree/b945b15de9085f5961a478d4f35b0c5c3427e248 [2]: https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-object-store-rs-0.11.1-rc1 [3]: https://github.com/apache/arrow-rs/blob/b945b15de9085f5961a478d4f35b0c5c3427e248/object_store/CHANGELOG.md -[4]: https://github.com/apache/arrow-rs/blob/master/object_store/dev/release/verify-release-candidate.sh +[4]: https://github.com/apache/arrow-rs/blob/main/object_store/dev/release/verify-release-candidate.sh ``` For the release to become "official" it needs at least three Apache Arrow PMC members to vote +1 on it. diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh index bbffde8..efc26fd 100755 --- a/dev/release/create-tarball.sh +++ b/dev/release/create-tarball.sh @@ -101,7 +101,7 @@ The vote will be open for at least 72 hours. [1]: https://github.com/apache/arrow-rs/tree/${release_hash} [2]: ${url} [3]: https://github.com/apache/arrow-rs/blob/${release_hash}/object_store/CHANGELOG.md -[4]: https://github.com/apache/arrow-rs/blob/master/object_store/dev/release/verify-release-candidate.sh +[4]: https://github.com/apache/arrow-rs/blob/main/object_store/dev/release/verify-release-candidate.sh MAIL echo "---------------------------------------------------------" From c8aa1da28d4c7718a63abe38102694fd884147e4 Mon Sep 17 00:00:00 2001 From: Nikhil Benesch Date: Sat, 30 Nov 2024 05:22:59 -0500 Subject: [PATCH 365/397] object-store: remove S3ConditionalPut::ETagPutIfNotExists (#6802) * Support real S3's If-Match semantics As of today [0] S3 now supports the If-Match for in-place conditional writes. This commit adjusts the existing support for S3ConditionalPut::Etag mode for compatibility with real S3's particular semantics, which vary slightly from MinIO and R2. Specifically: * Real S3 can occasionally return 409 Conflict when concurrent If-Match requests are in progress. These requests need to be retried. * Real S3 returns 404 Not Found instead of 412 Precondition Failed when issuing an If-Match request against an object that does not exist. Fix #6799. [0]: https://aws.amazon.com/about-aws/whats-new/2024/11/amazon-s3-functionality-conditional-writes/ * object-store: remove S3ConditionalPut::ETagPutIfNotExists Now that real S3 supports `If-Match`, we no longer need this special conditional put mode for real S3. 
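For illustration, a minimal sketch of how a caller can drive this `If-Match`
path through the public `object_store` API (an assumption based on the crate's
`put_opts`/`PutMode::Update` surface, not code taken from this patch):

    use object_store::path::Path;
    use object_store::{ObjectStore, PutMode, PutOptions, PutPayload, UpdateVersion};

    /// Overwrite `path` only if it still carries the etag we last observed.
    async fn compare_and_swap(store: &dyn ObjectStore, path: &Path) -> object_store::Result<()> {
        // Learn the current etag of the object
        let current = store.get(path).await?;
        let etag = current.meta.e_tag.clone();

        // `PutMode::Update` is what the S3 implementation translates into an
        // `If-Match` header; 409 Conflict responses are retried internally and
        // a 404 is surfaced as a precondition failure, per the change above.
        let opts: PutOptions = PutMode::Update(UpdateVersion {
            e_tag: etag,
            version: None,
        })
        .into();
        store.put_opts(path, PutPayload::from(vec![1u8, 2, 3]), opts).await?;
        Ok(())
    }

If another writer wins the race, `put_opts` returns `Error::Precondition` and
the caller can re-read the object and retry.
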
* [XXX put in real release version] Upgrade localstack * Update .github/workflows/object_store.yml --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/aws/client.rs | 10 ++++++++++ src/aws/mod.rs | 32 +++++++++++++++++++++++--------- src/aws/precondition.rs | 13 ------------- src/client/retry.rs | 14 +++++++++++++- 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 51c9177..4724968 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -290,6 +290,7 @@ pub(crate) struct Request<'a> { payload: Option, use_session_creds: bool, idempotent: bool, + retry_on_conflict: bool, retry_error_body: bool, } @@ -317,6 +318,13 @@ impl<'a> Request<'a> { Self { idempotent, ..self } } + pub(crate) fn retry_on_conflict(self, retry_on_conflict: bool) -> Self { + Self { + retry_on_conflict, + ..self + } + } + pub(crate) fn retry_error_body(self, retry_error_body: bool) -> Self { Self { retry_error_body, @@ -412,6 +420,7 @@ impl<'a> Request<'a> { self.builder .with_aws_sigv4(credential.authorizer(), sha) .retryable(&self.config.retry_config) + .retry_on_conflict(self.retry_on_conflict) .idempotent(self.idempotent) .retry_error_body(self.retry_error_body) .payload(self.payload) @@ -448,6 +457,7 @@ impl S3Client { config: &self.config, use_session_creds: true, idempotent: false, + retry_on_conflict: false, retry_error_body: false, } } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 81511ba..d7c8c9b 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -170,10 +170,7 @@ impl ObjectStore for AmazonS3 { match (opts.mode, &self.client.config.conditional_put) { (PutMode::Overwrite, _) => request.idempotent(true).do_put().await, (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), - ( - PutMode::Create, - Some(S3ConditionalPut::ETagMatch | S3ConditionalPut::ETagPutIfNotExists), - ) => { + (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { match request.header(&IF_NONE_MATCH, "*").do_put().await { // Technically If-None-Match should return NotModified but some stores, // such as R2, instead return PreconditionFailed @@ -197,9 +194,26 @@ impl ObjectStore for AmazonS3 { source: "ETag required for conditional put".to_string().into(), })?; match put { - S3ConditionalPut::ETagPutIfNotExists => Err(Error::NotImplemented), S3ConditionalPut::ETagMatch => { - request.header(&IF_MATCH, etag.as_str()).do_put().await + match request + .header(&IF_MATCH, etag.as_str()) + // Real S3 will occasionally report 409 Conflict + // if there are concurrent `If-Match` requests + // in flight, so we need to be prepared to retry + // 409 responses. + .retry_on_conflict(true) + .do_put() + .await + { + // Real S3 reports NotFound rather than PreconditionFailed when the + // object doesn't exist. Convert to PreconditionFailed for + // consistency with R2. This also matches what the HTTP spec + // says the behavior should be. 
+ Err(Error::NotFound { path, source }) => { + Err(Error::Precondition { path, source }) + } + r => r, + } } S3ConditionalPut::Dynamo(d) => { d.conditional_op(&self.client, location, Some(&etag), move || { @@ -487,6 +501,7 @@ mod tests { let integration = config.build().unwrap(); let config = &integration.client.config; let test_not_exists = config.copy_if_not_exists.is_some(); + let test_conditional_put = config.conditional_put.is_some(); put_get_delete_list(&integration).await; get_opts(&integration).await; @@ -517,9 +532,8 @@ mod tests { if test_not_exists { copy_if_not_exists(&integration).await; } - if let Some(conditional_put) = &config.conditional_put { - let supports_update = !matches!(conditional_put, S3ConditionalPut::ETagPutIfNotExists); - put_opts(&integration, supports_update).await; + if test_conditional_put { + put_opts(&integration, true).await; } // run integration test with unsigned payload enabled diff --git a/src/aws/precondition.rs b/src/aws/precondition.rs index e505805..b261ad0 100644 --- a/src/aws/precondition.rs +++ b/src/aws/precondition.rs @@ -138,17 +138,6 @@ pub enum S3ConditionalPut { /// [HTTP precondition]: https://datatracker.ietf.org/doc/html/rfc9110#name-preconditions ETagMatch, - /// Like `ETagMatch`, but with support for `PutMode::Create` and not - /// `PutMode::Option`. - /// - /// This is the limited form of conditional put supported by Amazon S3 - /// as of August 2024 ([announcement]). - /// - /// Encoded as `etag-put-if-not-exists` ignoring whitespace. - /// - /// [announcement]: https://aws.amazon.com/about-aws/whats-new/2024/08/amazon-s3-conditional-writes/ - ETagPutIfNotExists, - /// The name of a DynamoDB table to use for coordination /// /// Encoded as either `dynamo:` or `dynamo::` @@ -164,7 +153,6 @@ impl std::fmt::Display for S3ConditionalPut { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ETagMatch => write!(f, "etag"), - Self::ETagPutIfNotExists => write!(f, "etag-put-if-not-exists"), Self::Dynamo(lock) => write!(f, "dynamo: {}", lock.table_name()), } } @@ -174,7 +162,6 @@ impl S3ConditionalPut { fn from_str(s: &str) -> Option { match s.trim() { "etag" => Some(Self::ETagMatch), - "etag-put-if-not-exists" => Some(Self::ETagPutIfNotExists), trimmed => match trimmed.split_once(':')? { ("dynamo", s) => Some(Self::Dynamo(DynamoCommit::from_str(s)?)), _ => None, diff --git a/src/client/retry.rs b/src/client/retry.rs index 601bffd..a8a8e58 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -200,6 +200,7 @@ pub(crate) struct RetryableRequest { sensitive: bool, idempotent: Option, + retry_on_conflict: bool, payload: Option, retry_error_body: bool, @@ -217,6 +218,15 @@ impl RetryableRequest { } } + /// Set whether this request should be retried on a 409 Conflict response. 
+ #[cfg(feature = "aws")] + pub(crate) fn retry_on_conflict(self, retry_on_conflict: bool) -> Self { + Self { + retry_on_conflict, + ..self + } + } + /// Set whether this request contains sensitive data /// /// This will avoid printing out the URL in error messages @@ -340,7 +350,8 @@ impl RetryableRequest { let status = r.status(); if retries == max_retries || now.elapsed() > retry_timeout - || !status.is_server_error() + || !(status.is_server_error() + || (self.retry_on_conflict && status == StatusCode::CONFLICT)) { return Err(match status.is_client_error() { true => match r.text().await { @@ -467,6 +478,7 @@ impl RetryExt for reqwest::RequestBuilder { idempotent: None, payload: None, sensitive: false, + retry_on_conflict: false, retry_error_body: false, } } From 27f79e12c14caa435a36db0b2e46d95bf73135fb Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 9 Dec 2024 11:09:03 -0700 Subject: [PATCH 366/397] Fix multipart uploads with checksums on object locked buckets (#6794) Fix multipart uploads with checksums on object locked buckets (#6794) --- src/aws/client.rs | 39 +++++++++++++++++++++++------- src/aws/mod.rs | 60 +++++++++++++++++++++++++++++++++++++++++++++++ src/client/s3.rs | 27 ++++++++++++++++++--- 3 files changed, 115 insertions(+), 11 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 4724968..81015e8 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -29,7 +29,7 @@ use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::s3::{ CompleteMultipartUpload, CompleteMultipartUploadResult, CopyPartResult, - InitiateMultipartUploadResult, ListResponse, + InitiateMultipartUploadResult, ListResponse, PartMetadata, }; use crate::client::GetOptionsExt; use crate::multipart::PartId; @@ -62,6 +62,7 @@ use std::sync::Arc; const VERSION_HEADER: &str = "x-amz-version-id"; const SHA256_CHECKSUM: &str = "x-amz-checksum-sha256"; const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; +const ALGORITHM: &str = "x-amz-checksum-algorithm"; /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] @@ -390,10 +391,9 @@ impl<'a> Request<'a> { let payload_sha256 = sha256.finish(); if let Some(Checksum::SHA256) = self.config.checksum { - self.builder = self.builder.header( - "x-amz-checksum-sha256", - BASE64_STANDARD.encode(payload_sha256), - ); + self.builder = self + .builder + .header(SHA256_CHECKSUM, BASE64_STANDARD.encode(payload_sha256)); } self.payload_sha256 = Some(payload_sha256); } @@ -617,8 +617,15 @@ impl S3Client { location: &Path, opts: PutMultipartOpts, ) -> Result { - let response = self - .request(Method::POST, location) + let mut request = self.request(Method::POST, location); + if let Some(algorithm) = self.config.checksum { + match algorithm { + Checksum::SHA256 => { + request = request.header(ALGORITHM, "SHA256"); + } + } + } + let response = request .query(&[("uploads", "")]) .with_encryption_headers() .with_attributes(opts.attributes) @@ -669,8 +676,13 @@ impl S3Client { request = request.with_encryption_headers(); } let response = request.send().await?; + let checksum_sha256 = response + .headers() + .get(SHA256_CHECKSUM) + .and_then(|v| v.to_str().ok()) + .map(|v| v.to_string()); - let content_id = match is_copy { + let e_tag = match is_copy { false => get_etag(response.headers()).context(MetadataSnafu)?, true => { let response = response @@ -682,6 +694,17 @@ impl S3Client { response.e_tag } }; + + let content_id = if self.config.checksum == Some(Checksum::SHA256) { + let meta = 
PartMetadata { + e_tag, + checksum_sha256, + }; + quick_xml::se::to_string(&meta).unwrap() + } else { + e_tag + }; + Ok(PartId { content_id }) } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index d7c8c9b..a7f9264 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -493,6 +493,66 @@ mod tests { const NON_EXISTENT_NAME: &str = "nonexistentname"; + #[tokio::test] + async fn write_multipart_file_with_signature() { + maybe_skip_integration!(); + + let store = AmazonS3Builder::from_env() + .with_checksum_algorithm(Checksum::SHA256) + .build() + .unwrap(); + + let str = "test.bin"; + let path = Path::parse(str).unwrap(); + let opts = PutMultipartOpts::default(); + let mut upload = store.put_multipart_opts(&path, opts).await.unwrap(); + + upload + .put_part(PutPayload::from(vec![0u8; 10_000_000])) + .await + .unwrap(); + upload + .put_part(PutPayload::from(vec![0u8; 5_000_000])) + .await + .unwrap(); + + let res = upload.complete().await.unwrap(); + assert!(res.e_tag.is_some(), "Should have valid etag"); + + store.delete(&path).await.unwrap(); + } + + #[tokio::test] + async fn write_multipart_file_with_signature_object_lock() { + maybe_skip_integration!(); + + let bucket = "test-object-lock"; + let store = AmazonS3Builder::from_env() + .with_bucket_name(bucket) + .with_checksum_algorithm(Checksum::SHA256) + .build() + .unwrap(); + + let str = "test.bin"; + let path = Path::parse(str).unwrap(); + let opts = PutMultipartOpts::default(); + let mut upload = store.put_multipart_opts(&path, opts).await.unwrap(); + + upload + .put_part(PutPayload::from(vec![0u8; 10_000_000])) + .await + .unwrap(); + upload + .put_part(PutPayload::from(vec![0u8; 5_000_000])) + .await + .unwrap(); + + let res = upload.complete().await.unwrap(); + assert!(res.e_tag.is_some(), "Should have valid etag"); + + store.delete(&path).await.unwrap(); + } + #[tokio::test] async fn s3_test() { maybe_skip_integration!(); diff --git a/src/client/s3.rs b/src/client/s3.rs index dba752c..7fe956b 100644 --- a/src/client/s3.rs +++ b/src/client/s3.rs @@ -106,14 +106,32 @@ pub(crate) struct CompleteMultipartUpload { pub part: Vec, } +#[derive(Serialize, Deserialize)] +pub(crate) struct PartMetadata { + pub e_tag: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_sha256: Option, +} + impl From> for CompleteMultipartUpload { fn from(value: Vec) -> Self { let part = value .into_iter() .enumerate() - .map(|(part_number, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_number + 1, + .map(|(part_idx, part)| { + let md = match quick_xml::de::from_str::(&part.content_id) { + Ok(md) => md, + // fallback to old way + Err(_) => PartMetadata { + e_tag: part.content_id.clone(), + checksum_sha256: None, + }, + }; + MultipartPart { + e_tag: md.e_tag, + part_number: part_idx + 1, + checksum_sha256: md.checksum_sha256, + } }) .collect(); Self { part } @@ -126,6 +144,9 @@ pub(crate) struct MultipartPart { pub e_tag: String, #[serde(rename = "PartNumber")] pub part_number: usize, + #[serde(rename = "ChecksumSHA256")] + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_sha256: Option, } #[derive(Debug, Deserialize)] From 1b2043ecce8188397a585e750c13bc2c5c9daa36 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Wed, 11 Dec 2024 23:31:02 +0900 Subject: [PATCH 367/397] Always explicitly disable `gzip` automatic decompression on reqwest client used by object_store (#6843) * Explicitly disable gzip on reqwest client used by object_store * Add comment * Add integration test for checking reqwest gzip feature 
* Fix lint * Add comment explaining why gzip feature is enabled --- Cargo.toml | 2 ++ src/client/mod.rs | 4 ++++ tests/http.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 tests/http.rs diff --git a/Cargo.toml b/Cargo.toml index a047336..bcc8e0b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,6 +77,8 @@ http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" regex = "1.11.1" +# The "gzip" feature for reqwest is enabled for an integration test. +reqwest = { version = "0.12", features = ["gzip"] } http = "1.1.0" [[test]] diff --git a/src/client/mod.rs b/src/client/mod.rs index 76d1c1f..1b7ce5a 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -671,6 +671,10 @@ impl ClientOptions { builder = builder.danger_accept_invalid_certs(true) } + // Reqwest will remove the `Content-Length` header if it is configured to + // transparently decompress the body via the non-default `gzip` feature. + builder = builder.no_gzip(); + builder .https_only(!self.allow_http.get()?) .build() diff --git a/tests/http.rs b/tests/http.rs new file mode 100644 index 0000000..a9b3145 --- /dev/null +++ b/tests/http.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests the HTTP store implementation + +#[cfg(feature = "http")] +use object_store::{http::HttpBuilder, path::Path, GetOptions, GetRange, ObjectStore}; + +/// Tests that even when reqwest has the `gzip` feature enabled, the HTTP store +/// does not error on a missing `Content-Length` header. +#[tokio::test] +#[cfg(feature = "http")] +async fn test_http_store_gzip() { + let http_store = HttpBuilder::new() + .with_url("https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main") + .build() + .unwrap(); + + let _ = http_store + .get_opts( + &Path::parse("LICENSE.txt").unwrap(), + GetOptions { + range: Some(GetRange::Bounded(0..100)), + ..Default::default() + }, + ) + .await + .unwrap(); +} From b14d44a61dda99f4f30d4c9d47a5b024c6ae198e Mon Sep 17 00:00:00 2001 From: Andrew Varnon Date: Thu, 12 Dec 2024 17:58:55 -0500 Subject: [PATCH 368/397] Use randomized content ID for Azure multipart uploads (#6869) * Use randomized content ID for Azure multipart uploads * Update object_store/src/azure/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! 
Use randomized content ID for Azure multipart uploads --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/aws/mod.rs | 1 + src/azure/client.rs | 6 ++-- src/azure/mod.rs | 1 + src/gcp/mod.rs | 1 + src/integration.rs | 87 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 2 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index a7f9264..7f449c4 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -570,6 +570,7 @@ mod tests { rename_and_copy(&integration).await; stream_get(&integration).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, true).await; signing(&integration).await; s3_encryption(&integration).await; put_get_attributes(&integration).await; diff --git a/src/azure/client.rs b/src/azure/client.rs index 76dedd7..69ff395 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -36,6 +36,7 @@ use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use hyper::http::HeaderName; +use rand::Rng as _; use reqwest::{ header::{HeaderMap, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE, IF_MATCH, IF_NONE_MATCH}, Client as ReqwestClient, Method, RequestBuilder, Response, @@ -556,10 +557,11 @@ impl AzureClient { pub(crate) async fn put_block( &self, path: &Path, - part_idx: usize, + _part_idx: usize, payload: PutPayload, ) -> Result { - let content_id = format!("{part_idx:20}"); + let part_idx = u128::from_be_bytes(rand::thread_rng().gen()); + let content_id = format!("{part_idx:032x}"); let block_id = BASE64_STANDARD.encode(&content_id); self.put_request(path, payload) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 177bffb..81b6667 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -314,6 +314,7 @@ mod tests { stream_get(&integration).await; put_opts(&integration, true).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, false).await; signing(&integration).await; let validate = !integration.client.config().disable_tagging; diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 039ec46..5199135 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -297,6 +297,7 @@ mod test { // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, true).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; put_opts(&integration, true).await; diff --git a/src/integration.rs b/src/integration.rs index 3017787..20e95fd 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -24,6 +24,8 @@ //! //! They are intended solely for testing purposes. 
+use core::str; + use crate::multipart::MultipartStore; use crate::path::Path; use crate::{ @@ -1109,3 +1111,88 @@ async fn delete_fixtures(storage: &DynObjectStore) { .await .unwrap(); } + +/// Tests a race condition where 2 threads are performing multipart writes to the same path +pub async fn multipart_race_condition(storage: &dyn ObjectStore, last_writer_wins: bool) { + let path = Path::from("test_multipart_race_condition"); + + let mut multipart_upload_1 = storage.put_multipart(&path).await.unwrap(); + let mut multipart_upload_2 = storage.put_multipart(&path).await.unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 0)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 0)).into()) + .await + .unwrap(); + + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 1)).into()) + .await + .unwrap(); + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 1)).into()) + .await + .unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 2)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 2)).into()) + .await + .unwrap(); + + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 3)).into()) + .await + .unwrap(); + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 3)).into()) + .await + .unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 4)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 4)).into()) + .await + .unwrap(); + + multipart_upload_1.complete().await.unwrap(); + + if last_writer_wins { + multipart_upload_2.complete().await.unwrap(); + } else { + let err = multipart_upload_2.complete().await.unwrap_err(); + + assert!(matches!(err, crate::Error::Generic { .. 
}), "{err}"); + } + + let get_result = storage.get(&path).await.unwrap(); + let bytes = get_result.bytes().await.unwrap(); + let string_contents = str::from_utf8(&bytes).unwrap(); + + if last_writer_wins { + assert!(string_contents.starts_with( + format!( + "2:{:05300000},2:{:05300000},2:{:05300000},2:{:05300000},2:{:05300000},", + 0, 1, 2, 3, 4 + ) + .as_str() + )); + } else { + assert!(string_contents.starts_with( + format!( + "1:{:05300000},1:{:05300000},1:{:05300000},1:{:05300000},1:{:05300000},", + 0, 1, 2, 3, 4 + ) + .as_str() + )); + } +} From 14dfe426abc861d25744d920f3eb2f0f616e9b82 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 20 Dec 2024 16:18:55 -0500 Subject: [PATCH 369/397] [object_store]: Version and Changelog for 0.11.2 (#6908) * [object_store]: Version and Changelog for 0.11.2 * increment version * update script * changelog * tweaks * Update object_store/CHANGELOG.md Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- CHANGELOG-old.md | 39 ++++++++++++++++++++++++ CHANGELOG.md | 51 ++++++++++++++++---------------- Cargo.toml | 2 +- dev/release/README.md | 5 +++- dev/release/update_change_log.sh | 4 +-- 5 files changed, 72 insertions(+), 29 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 28dbde4..c426892 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,45 @@ # Historical Changelog + +## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) + +**Implemented enhancements:** + +- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) +- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) +- object\_store: enable lint `unreachable_pub` 
[\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) +- \[object-store\] Require tokio 1.29.0. [\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) +- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) +- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) +- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) +- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) +- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) +- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- docs: Add parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) +- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) + + + ## [object_store_0.11.0](https://github.com/apache/arrow-rs/tree/object_store_0.11.0) (2024-08-12) [Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.2...object_store_0.11.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9558598..0e834c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,41 +19,42 @@ # Changelog -## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) +## [object_store_0.11.2](https://github.com/apache/arrow-rs/tree/object_store_0.11.2) (2024-12-20) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.1...object_store_0.11.2) **Implemented enhancements:** -- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store's AzureClient should protect against multiple streams performing put\_block in parallel for the same BLOB path [\#6868](https://github.com/apache/arrow-rs/issues/6868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support S3 Put IfMatch [\#6799](https://github.com/apache/arrow-rs/issues/6799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store Azure Government using OAuth [\#6759](https://github.com/apache/arrow-rs/issues/6759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support for AWS Requester Pays buckets [\#6716](https://github.com/apache/arrow-rs/issues/6716) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object-store\]: Implement credential\_process support for S3 [\#6422](https://github.com/apache/arrow-rs/issues/6422) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Conditional put and rename\_if\_not\_exist on S3 [\#6285](https://github.com/apache/arrow-rs/issues/6285) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `object_store` errors when `reqwest` `gzip` feature is enabled [\#6842](https://github.com/apache/arrow-rs/issues/6842) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Multi-part s3 uploads fail when using checksum [\#6793](https://github.com/apache/arrow-rs/issues/6793) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `with_unsigned_payload` shouldn't generate payload hash [\#6697](https://github.com/apache/arrow-rs/issues/6697) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[Object\_store\] min\_ttl is too high for GKE tokens [\#6625](https://github.com/apache/arrow-rs/issues/6625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store `test_private_bucket` fails - store: "S3", source: BucketNotFound { bucket: "bloxbender" } [\#6600](https://github.com/apache/arrow-rs/issues/6600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- S3 endpoint and trailing slash result in weird/invalid requests [\#6580](https://github.com/apache/arrow-rs/issues/6580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) -- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) -- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) -- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) -- \[object-store\] Require tokio 1.29.0. [\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) -- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) -- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) -- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) -- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) -- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) -- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) -- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) -- 
object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) -- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) -- docs: Add parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) -- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) -- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- Use randomized content ID for Azure multipart uploads [\#6869](https://github.com/apache/arrow-rs/pull/6869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avarnon](https://github.com/avarnon)) +- Always explicitly disable `gzip` automatic decompression on reqwest client used by object\_store [\#6843](https://github.com/apache/arrow-rs/pull/6843) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([phillipleblanc](https://github.com/phillipleblanc)) +- object-store: remove S3ConditionalPut::ETagPutIfNotExists [\#6802](https://github.com/apache/arrow-rs/pull/6802) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- Fix multipart uploads with checksums on object locked buckets [\#6794](https://github.com/apache/arrow-rs/pull/6794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Add AuthorityHost to AzureConfigKey [\#6773](https://github.com/apache/arrow-rs/pull/6773) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([zadeluca](https://github.com/zadeluca)) +- object\_store: Add support for requester pays buckets [\#6768](https://github.com/apache/arrow-rs/pull/6768) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron)) +- check sign\_payload instead of skip\_signature before computing checksum [\#6698](https://github.com/apache/arrow-rs/pull/6698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mherrerarendon](https://github.com/mherrerarendon)) +- Update quick-xml requirement from 0.36.0 to 0.37.0 in /object\_store [\#6687](https://github.com/apache/arrow-rs/pull/6687) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Support native S3 conditional writes [\#6682](https://github.com/apache/arrow-rs/pull/6682) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- \[object\_store\] fix S3 endpoint and trailing slash result in invalid requests [\#6641](https://github.com/apache/arrow-rs/pull/6641) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adbmal](https://github.com/adbmal)) +- Lower GCP token min\_ttl to 4 minutes and add 
backoff to token refresh logic [\#6638](https://github.com/apache/arrow-rs/pull/6638) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mwylde](https://github.com/mwylde)) +- Remove `test_private_bucket` object\_store test [\#6601](https://github.com/apache/arrow-rs/pull/6601) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) diff --git a/Cargo.toml b/Cargo.toml index bcc8e0b..bf254b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.11.1" +version = "0.11.2" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/dev/release/README.md b/dev/release/README.md index 912ff4c..2dd1f62 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -24,7 +24,10 @@ This file documents the release process for the `object_store` crate. -At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule. +We release a new version of `object_store` according to the schedule listed in +the [main README.md] + +[main README.md]: https://github.com/apache/arrow-rs?tab=readme-ov-file#object_store-crate As we are still in an early phase, we use the 0.x version scheme. If any code has been merged to main that has a breaking API change, as defined in [Rust RFC 1105] diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 3072447..2797b62 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.11.0" -FUTURE_RELEASE="object_store_0.11.1" +SINCE_TAG="object_store_0.11.1" +FUTURE_RELEASE="object_store_0.11.2" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 4860b01fef40b504caff7cca831bed94b0dd78a5 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 2 Jan 2025 10:09:34 +0100 Subject: [PATCH 370/397] object_store: Add enabled-by-default "fs" feature (#6636) --- Cargo.toml | 4 +++- src/chunked.rs | 4 ++++ src/lib.rs | 17 +++++++++++------ src/limit.rs | 1 + src/parse.rs | 4 ++-- src/throttle.rs | 2 ++ src/util.rs | 2 +- 7 files changed, 24 insertions(+), 10 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bf254b3..a127be3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,7 @@ percent-encoding = "2.1" snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } tracing = { version = "0.1" } url = "2.2" -walkdir = "2" +walkdir = { version = "2", optional = true } # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } @@ -61,8 +61,10 @@ httparse = { version = "1.8.0", default-features = false, features = ["std"], op nix = { version = "0.29.0", features = ["fs"] } [features] +default = ["fs"] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud", "httparse"] +fs = ["walkdir"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud", "md-5"] http = ["cloud"] diff --git a/src/chunked.rs b/src/chunked.rs index 98cc204..3f83c13 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -86,6 +86,7 @@ impl ObjectStore for ChunkedStore { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let r = self.inner.get_opts(location, options).await?; let stream = match r.payload { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] 
GetResultPayload::File(file, path) => { crate::local::chunked_stream(file, path, r.range.clone(), self.chunk_size) } @@ -178,7 +179,9 @@ impl ObjectStore for ChunkedStore { mod tests { use futures::StreamExt; + #[cfg(feature = "fs")] use crate::integration::*; + #[cfg(feature = "fs")] use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; @@ -209,6 +212,7 @@ mod tests { } } + #[cfg(feature = "fs")] #[tokio::test] async fn test_chunked() { let temporary = tempfile::tempdir().unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 4d8d8f0..6f57332 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -66,10 +66,13 @@ //! By default, this crate provides the following implementations: //! //! * Memory: [`InMemory`](memory::InMemory) -//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! //! Feature flags are used to enable support for other implementations: //! +#![cfg_attr( + feature = "fs", + doc = "* Local filesystem: [`LocalFileSystem`](local::LocalFileSystem)" +)] #![cfg_attr( feature = "gcp", doc = "* [`gcp`]: [Google Cloud Storage](https://cloud.google.com/storage/) support. See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" @@ -513,7 +516,7 @@ pub mod gcp; #[cfg(feature = "http")] pub mod http; pub mod limit; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] pub mod local; pub mod memory; pub mod path; @@ -557,7 +560,7 @@ pub use upload::*; pub use util::{coalesce_ranges, collect_bytes, GetRange, OBJECT_STORE_COALESCE_DEFAULT}; use crate::path::Path; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use crate::util::maybe_spawn_blocking; use async_trait::async_trait; use bytes::Bytes; @@ -565,7 +568,7 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use snafu::Snafu; use std::fmt::{Debug, Formatter}; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use std::sync::Arc; @@ -1028,6 +1031,7 @@ pub struct GetResult { /// be able to optimise the case of a file already present on local disk pub enum GetResultPayload { /// The file, path + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] File(std::fs::File, std::path::PathBuf), /// An opaque stream of bytes Stream(BoxStream<'static, Result>), @@ -1036,6 +1040,7 @@ pub enum GetResultPayload { impl Debug for GetResultPayload { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] Self::File(_, _) => write!(f, "GetResultPayload(File)"), Self::Stream(_) => write!(f, "GetResultPayload(Stream)"), } @@ -1047,7 +1052,7 @@ impl GetResult { pub async fn bytes(self) -> Result { let len = self.range.end - self.range.start; match self.payload { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(mut file, path) => { maybe_spawn_blocking(move || { file.seek(SeekFrom::Start(self.range.start as _)) @@ -1087,7 +1092,7 @@ impl GetResult { /// no additional complexity or overheads pub fn into_stream(self) -> BoxStream<'static, Result> { match self.payload { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(file, path) => { const CHUNK_SIZE: usize = 8 * 1024; local::chunked_stream(file, path, self.range, CHUNK_SIZE) diff --git a/src/limit.rs b/src/limit.rs 
index 64b96ad..6a3c3b5 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -199,6 +199,7 @@ impl ObjectStore for LimitStore { fn permit_get_result(r: GetResult, permit: OwnedSemaphorePermit) -> GetResult { let payload = match r.payload { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] v @ GetResultPayload::File(_, _) => v, GetResultPayload::Stream(s) => { GetResultPayload::Stream(PermitWrapper::new(s, permit).boxed()) diff --git a/src/parse.rs b/src/parse.rs index debc9e5..a391930 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; @@ -179,7 +179,7 @@ where let path = Path::parse(path)?; let store = match scheme { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] ObjectStoreScheme::Local => Box::new(LocalFileSystem::new()) as _, ObjectStoreScheme::Memory => Box::new(InMemory::new()) as _, #[cfg(feature = "aws")] diff --git a/src/throttle.rs b/src/throttle.rs index d07276c..b9dff5c 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -307,8 +307,10 @@ fn usize_to_u32_saturate(x: usize) -> u32 { } fn throttle_get(result: GetResult, wait_get_per_byte: Duration) -> GetResult { + #[allow(clippy::infallible_destructuring_match)] let s = match result.payload { GetResultPayload::Stream(s) => s, + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(_, _) => unimplemented!(), }; diff --git a/src/util.rs b/src/util.rs index ecf90f9..99102a9 100644 --- a/src/util.rs +++ b/src/util.rs @@ -75,7 +75,7 @@ where } } -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] /// Takes a function and spawns it to a tokio blocking pool if available pub(crate) async fn maybe_spawn_blocking(f: F) -> Result where From 2eacb3811cd9850d8f051c36ebf96693c12cfcf2 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 2 Jan 2025 23:25:44 +0100 Subject: [PATCH 371/397] object_store: Migrate from `snafu` to `thiserror` (#6266) * object_store: Add `thiserror` dependency * object_store/memory: Migrate from `snafu` to `thiserror` * object_store/parse: Migrate from `snafu` to `thiserror` * object_store/util: Migrate from `snafu` to `thiserror` * object_store/local: Migrate from `snafu` to `thiserror` * object_store/delimited: Migrate from `snafu` to `thiserror` * object_store/path/parts: Migrate from `snafu` to `thiserror` * object_store/path: Migrate from `snafu` to `thiserror` * object_store/http: Migrate from `snafu` to `thiserror` * object_store/client: Migrate from `snafu` to `thiserror` * object_store/aws: Migrate from `snafu` to `thiserror` * object_store/azure: Migrate from `snafu` to `thiserror` * object_store/gcp: Migrate from `snafu` to `thiserror` * object_store/lib: Migrate from `snafu` to `thiserror` * Remove `snafu` dependency --- Cargo.toml | 2 +- src/aws/builder.rs | 52 ++++++----- src/aws/client.rs | 87 ++++++++++-------- src/aws/credential.rs | 17 ++-- src/aws/resolve.rs | 30 +++---- src/azure/builder.rs | 65 ++++++++------ src/azure/client.rs | 93 ++++++++++--------- src/azure/credential.rs | 41 +++++---- src/client/get.rs | 97 ++++++++++---------- src/client/header.rs | 54 ++++++++---- src/client/retry.rs | 13 ++- src/delimited.rs | 15 ++-- src/gcp/builder.rs | 48 ++++++---- src/gcp/client.rs | 91 
++++++++++--------- src/gcp/credential.rs | 57 ++++++------ src/http/client.rs | 52 +++++++---- src/http/mod.rs | 13 ++- src/lib.rs | 36 ++++---- src/local.rs | 191 ++++++++++++++++++---------------------- src/memory.rs | 31 +++---- src/parse.rs | 12 +-- src/path/mod.rs | 35 +++++--- src/path/parts.rs | 7 +- src/util.rs | 9 +- 24 files changed, 620 insertions(+), 528 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a127be3..6f5e9db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ humantime = "2.1" itertools = "0.13.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" -snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } +thiserror = "2.0.2" tracing = { version = "0.1" } url = "2.2" walkdir = { version = "2", optional = true } diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 840245a..d29fa78 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -32,7 +32,6 @@ use itertools::Itertools; use md5::{Digest, Md5}; use reqwest::header::{HeaderMap, HeaderValue}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -43,46 +42,46 @@ use url::Url; static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Missing bucket name"))] + #[error("Missing bucket name")] MissingBucketName, - #[snafu(display("Missing AccessKeyId"))] + #[error("Missing AccessKeyId")] MissingAccessKeyId, - #[snafu(display("Missing SecretAccessKey"))] + #[error("Missing SecretAccessKey")] MissingSecretAccessKey, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, - #[snafu(display("Invalid Zone suffix for bucket '{bucket}'"))] + #[error("Invalid Zone suffix for bucket '{bucket}'")] ZoneSuffix { bucket: String }, - #[snafu(display("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", \"sse:kms:dsse\" and \"sse-c\".", passed))] + #[error("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", \"sse:kms:dsse\" and \"sse-c\".", passed)] InvalidEncryptionType { passed: String }, - #[snafu(display( + #[error( "Invalid encryption header values. 
Header: {}, source: {}", header, source - ))] + )] InvalidEncryptionHeader { header: &'static str, source: Box, @@ -603,8 +602,15 @@ impl AmazonS3Builder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| { + let url = url.into(); + Error::UnableToParseUrl { url, source } + })?; + + let host = parsed + .host_str() + .ok_or_else(|| Error::UrlNotRecognised { url: url.into() })?; + match parsed.scheme() { "s3" | "s3a" => self.bucket_name = Some(host.to_string()), "https" => match host.splitn(4, '.').collect_tuple() { @@ -630,9 +636,12 @@ impl AmazonS3Builder { self.bucket_name = Some(bucket.into()); } } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.into(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } }; Ok(()) } @@ -875,7 +884,7 @@ impl AmazonS3Builder { self.parse_url(&url)?; } - let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; + let bucket = self.bucket_name.ok_or(Error::MissingBucketName)?; let region = self.region.unwrap_or_else(|| "us-east-1".to_string()); let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; @@ -957,7 +966,10 @@ impl AmazonS3Builder { let (session_provider, zonal_endpoint) = match self.s3_express.get()? 
{ true => { - let zone = parse_bucket_az(&bucket).context(ZoneSuffixSnafu { bucket: &bucket })?; + let zone = parse_bucket_az(&bucket).ok_or_else(|| { + let bucket = bucket.clone(); + Error::ZoneSuffix { bucket } + })?; // https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-Regions-and-Zones.html let endpoint = format!("https://{bucket}.s3express-{zone}.{region}.amazonaws.com"); diff --git a/src/aws/client.rs b/src/aws/client.rs index 81015e8..25fdd33 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -56,7 +56,6 @@ use reqwest::{Client as ReqwestClient, Method, RequestBuilder, Response}; use ring::digest; use ring::digest::Context; use serde::{Deserialize, Serialize}; -use snafu::{ResultExt, Snafu}; use std::sync::Arc; const VERSION_HEADER: &str = "x-amz-version-id"; @@ -65,56 +64,56 @@ const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; const ALGORITHM: &str = "x-amz-checksum-algorithm"; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("Error performing DeleteObjects request: {}", source))] + #[error("Error performing DeleteObjects request: {}", source)] DeleteObjectsRequest { source: crate::client::retry::Error }, - #[snafu(display( + #[error( "DeleteObjects request failed for key {}: {} (code: {})", path, message, code - ))] + )] DeleteFailed { path: String, code: String, message: String, }, - #[snafu(display("Error getting DeleteObjects response body: {}", source))] + #[error("Error getting DeleteObjects response body: {}", source)] DeleteObjectsResponse { source: reqwest::Error }, - #[snafu(display("Got invalid DeleteObjects response: {}", source))] + #[error("Got invalid DeleteObjects response: {}", source)] InvalidDeleteObjectsResponse { source: Box, }, - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Error getting create multipart response body: {}", source))] + #[error("Error getting create multipart response body: {}", source)] CreateMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Error performing complete multipart request: {}: {}", path, source))] + #[error("Error performing complete multipart request: {}: {}", path, source)] CompleteMultipartRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error getting complete multipart response body: {}", source))] + #[error("Error getting complete multipart response body: {}", source)] CompleteMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Got invalid multipart response: {}", source))] + #[error("Got invalid multipart response: {}", source)] InvalidMultipartResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, @@ -263,10 +262,15 @@ impl SessionCredential<'_> { } } -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum RequestError { - 
#[snafu(context(false))] - Generic { source: crate::Error }, + #[error(transparent)] + Generic { + #[from] + source: crate::Error, + }, + + #[error("Retry")] Retry { source: crate::client::retry::Error, path: String, @@ -426,12 +430,16 @@ impl<'a> Request<'a> { .payload(self.payload) .send() .await - .context(RetrySnafu { path }) + .map_err(|source| { + let path = path.into(); + RequestError::Retry { source, path } + }) } pub(crate) async fn do_put(self) -> Result { let response = self.send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } } @@ -535,10 +543,10 @@ impl S3Client { .with_aws_sigv4(credential.authorizer(), Some(digest.as_ref())) .send_retry(&self.config.retry_config) .await - .context(DeleteObjectsRequestSnafu {})? + .map_err(|source| Error::DeleteObjectsRequest { source })? .bytes() .await - .context(DeleteObjectsResponseSnafu {})?; + .map_err(|source| Error::DeleteObjectsResponse { source })?; let response: BatchDeleteResponse = quick_xml::de::from_reader(response.reader()).map_err(|err| { @@ -635,10 +643,10 @@ impl S3Client { .await? .bytes() .await - .context(CreateMultipartResponseBodySnafu)?; + .map_err(|source| Error::CreateMultipartResponseBody { source })?; - let response: InitiateMultipartUploadResult = - quick_xml::de::from_reader(response.reader()).context(InvalidMultipartResponseSnafu)?; + let response: InitiateMultipartUploadResult = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(response.upload_id) } @@ -683,14 +691,14 @@ impl S3Client { .map(|v| v.to_string()); let e_tag = match is_copy { - false => get_etag(response.headers()).context(MetadataSnafu)?, + false => get_etag(response.headers()).map_err(|source| Error::Metadata { source })?, true => { let response = response .bytes() .await - .context(CreateMultipartResponseBodySnafu)?; + .map_err(|source| Error::CreateMultipartResponseBody { source })?; let response: CopyPartResult = quick_xml::de::from_reader(response.reader()) - .context(InvalidMultipartResponseSnafu)?; + .map_err(|source| Error::InvalidMultipartResponse { source })?; response.e_tag } }; @@ -764,19 +772,21 @@ impl S3Client { .retry_error_body(true) .send() .await - .context(CompleteMultipartRequestSnafu { - path: location.as_ref(), + .map_err(|source| Error::CompleteMultipartRequest { + source, + path: location.as_ref().to_string(), })?; - let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + let version = get_version(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?; let data = response .bytes() .await - .context(CompleteMultipartResponseBodySnafu)?; + .map_err(|source| Error::CompleteMultipartResponseBody { source })?; - let response: CompleteMultipartUploadResult = - quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + let response: CompleteMultipartUploadResult = quick_xml::de::from_reader(data.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(PutResult { e_tag: Some(response.e_tag), @@ -884,13 +894,14 @@ impl ListClient for S3Client { .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? 
.bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; + + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; - let mut response: ListResponse = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/src/aws/credential.rs b/src/aws/credential.rs index ee2f8e2..9c74e1c 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -29,23 +29,22 @@ use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::collections::BTreeMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::warn; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] #[allow(clippy::enum_variant_names)] enum Error { - #[snafu(display("Error performing CreateSession request: {source}"))] + #[error("Error performing CreateSession request: {source}")] CreateSessionRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting CreateSession response: {source}"))] + #[error("Error getting CreateSession response: {source}")] CreateSessionResponse { source: reqwest::Error }, - #[snafu(display("Invalid CreateSessionOutput response: {source}"))] + #[error("Invalid CreateSessionOutput response: {source}")] CreateSessionOutput { source: quick_xml::DeError }, } @@ -726,13 +725,13 @@ impl TokenProvider for SessionProvider { .with_aws_sigv4(Some(authorizer), None) .send_retry(retry) .await - .context(CreateSessionRequestSnafu)? + .map_err(|source| Error::CreateSessionRequest { source })? 
.bytes() .await - .context(CreateSessionResponseSnafu)?; + .map_err(|source| Error::CreateSessionResponse { source })?; - let resp: CreateSessionOutput = - quick_xml::de::from_reader(bytes.reader()).context(CreateSessionOutputSnafu)?; + let resp: CreateSessionOutput = quick_xml::de::from_reader(bytes.reader()) + .map_err(|source| Error::CreateSessionOutput { source })?; let creds = resp.credentials; Ok(TemporaryToken { diff --git a/src/aws/resolve.rs b/src/aws/resolve.rs index 25bc74f..db899ea 100644 --- a/src/aws/resolve.rs +++ b/src/aws/resolve.rs @@ -17,21 +17,20 @@ use crate::aws::STORE; use crate::{ClientOptions, Result}; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Bucket '{}' not found", bucket))] + #[error("Bucket '{}' not found", bucket)] BucketNotFound { bucket: String }, - #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + #[error("Failed to resolve region for bucket '{}'", bucket)] ResolveRegion { bucket: String, source: reqwest::Error, }, - #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + #[error("Failed to parse the region for bucket '{}'", bucket)] RegionParse { bucket: String }, } @@ -54,22 +53,23 @@ pub async fn resolve_bucket_region(bucket: &str, client_options: &ClientOptions) let client = client_options.client()?; - let response = client - .head(&endpoint) - .send() - .await - .context(ResolveRegionSnafu { bucket })?; + let response = client.head(&endpoint).send().await.map_err(|source| { + let bucket = bucket.into(); + Error::ResolveRegion { bucket, source } + })?; - ensure!( - response.status() != StatusCode::NOT_FOUND, - BucketNotFoundSnafu { bucket } - ); + if response.status() == StatusCode::NOT_FOUND { + let bucket = bucket.into(); + return Err(Error::BucketNotFound { bucket }.into()); + } let region = response .headers() .get("x-amz-bucket-region") .and_then(|x| x.to_str().ok()) - .context(RegionParseSnafu { bucket })?; + .ok_or_else(|| Error::RegionParse { + bucket: bucket.into(), + })?; Ok(region.to_string()) } diff --git a/src/azure/builder.rs b/src/azure/builder.rs index 08c9a23..f0572eb 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -26,7 +26,6 @@ use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use url::Url; @@ -45,48 +44,48 @@ const EMULATOR_ACCOUNT_KEY: &str = const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; /// A specialized `Error` for Azure builder-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. 
Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unable parse emulator url {}={}, Error: {}", env_name, env_value, source - ))] + )] UnableToParseEmulatorUrl { env_name: String, env_value: String, source: url::ParseError, }, - #[snafu(display("Account must be specified"))] + #[error("Account must be specified")] MissingAccount {}, - #[snafu(display("Container name must be specified"))] + #[error("Container name must be specified")] MissingContainerName {}, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Failed parsing an SAS key"))] + #[error("Failed parsing an SAS key")] DecodeSasKey { source: std::str::Utf8Error }, - #[snafu(display("Missing component in SAS query pair"))] + #[error("Missing component in SAS query pair")] MissingSasComponent {}, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, } @@ -642,11 +641,17 @@ impl MicrosoftAzureBuilder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| { + let url = url.into(); + Error::UnableToParseUrl { url, source } + })?; + + let host = parsed + .host_str() + .ok_or_else(|| Error::UrlNotRecognised { url: url.into() })?; let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), + true => Err(Error::UrlNotRecognised { url: url.into() }), false => Ok(s.to_string()), }; @@ -665,7 +670,7 @@ impl MicrosoftAzureBuilder { self.account_name = Some(validate(a)?); self.use_fabric_endpoint = true.into(); } else { - return Err(UrlNotRecognisedSnafu { url }.build().into()); + return Err(Error::UrlNotRecognised { url: url.into() }.into()); } } "https" => match host.split_once('.') { @@ -689,9 +694,12 @@ impl MicrosoftAzureBuilder { } self.use_fabric_endpoint = true.into(); } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.into(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } } Ok(()) } @@ -924,8 +932,10 @@ impl MicrosoftAzureBuilder { }, }; - let url = - Url::parse(&account_url).context(UnableToParseUrlSnafu { url: account_url })?; + let url = Url::parse(&account_url).map_err(|source| { + let url = account_url.clone(); + Error::UnableToParseUrl { url, source } + })?; let credential = if let Some(credential) = self.credentials { credential @@ -1030,10 +1040,13 @@ impl MicrosoftAzureBuilder { /// if present, otherwise falls back to default_url fn url_from_env(env_name: &str, default_url: &str) -> Result { let url = match std::env::var(env_name) { - Ok(env_value) => Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { - env_name, - env_value, - })?, + Ok(env_value) => { + 
Url::parse(&env_value).map_err(|source| Error::UnableToParseEmulatorUrl { + env_name: env_name.into(), + env_value, + source, + })? + } Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), }; Ok(url) @@ -1042,7 +1055,7 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { fn split_sas(sas: &str) -> Result, Error> { let sas = percent_decode_str(sas) .decode_utf8() - .context(DecodeSasKeySnafu {})?; + .map_err(|source| Error::DecodeSasKey { source })?; let kv_str_pairs = sas .trim_start_matches('?') .split('&') diff --git a/src/azure/client.rs b/src/azure/client.rs index 69ff395..ea3a5fa 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -42,7 +42,6 @@ use reqwest::{ Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; @@ -60,84 +59,84 @@ static MS_CONTENT_LANGUAGE: HeaderName = HeaderName::from_static("x-ms-blob-cont static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("Error performing get request {}: {}", path, source))] + #[error("Error performing get request {}: {}", path, source)] GetRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing put request {}: {}", path, source))] + #[error("Error performing put request {}: {}", path, source)] PutRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing delete request {}: {}", path, source))] + #[error("Error performing delete request {}: {}", path, source)] DeleteRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing bulk delete request: {}", source))] + #[error("Error performing bulk delete request: {}", source)] BulkDeleteRequest { source: crate::client::retry::Error }, - #[snafu(display("Error receiving bulk delete request body: {}", source))] + #[error("Error receiving bulk delete request body: {}", source)] BulkDeleteRequestBody { source: reqwest::Error }, - #[snafu(display( + #[error( "Bulk delete request failed due to invalid input: {} (code: {})", reason, code - ))] + )] BulkDeleteRequestInvalidInput { code: String, reason: String }, - #[snafu(display("Got invalid bulk delete response: {}", reason))] + #[error("Got invalid bulk delete response: {}", reason)] InvalidBulkDeleteResponse { reason: String }, - #[snafu(display( + #[error( "Bulk delete request failed for key {}: {} (code: {})", path, reason, code - ))] + )] DeleteFailed { path: String, code: String, reason: String, }, - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, - #[snafu(display("ETag required for conditional 
update"))] + #[error("ETag required for conditional update")] MissingETag, - #[snafu(display("Error requesting user delegation key: {}", source))] + #[error("Error requesting user delegation key: {}", source)] DelegationKeyRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting user delegation key response body: {}", source))] + #[error("Error getting user delegation key response body: {}", source)] DelegationKeyResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid user delegation key response: {}", source))] + #[error("Got invalid user delegation key response: {}", source)] DelegationKeyResponse { source: quick_xml::de::DeError }, - #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] + #[error("Generating SAS keys with SAS tokens auth is not supported")] SASforSASNotSupported, - #[snafu(display("Generating SAS keys while skipping signatures is not supported"))] + #[error("Generating SAS keys while skipping signatures is not supported")] SASwithSkipSignature, } @@ -268,8 +267,9 @@ impl<'a> PutRequest<'a> { .payload(Some(self.payload)) .send() .await - .context(PutRequestSnafu { - path: self.path.as_ref(), + .map_err(|source| { + let path = self.path.as_ref().into(); + Error::PutRequest { path, source } })?; Ok(response) @@ -544,13 +544,14 @@ impl AzureClient { PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&IF_NONE_MATCH, "*"), PutMode::Update(v) => { - let etag = v.e_tag.as_ref().context(MissingETagSnafu)?; + let etag = v.e_tag.as_ref().ok_or(Error::MissingETag)?; builder.header(&IF_MATCH, etag) } }; let response = builder.header(&BLOB_TYPE, "BlockBlob").send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } /// PUT a block @@ -595,7 +596,8 @@ impl AzureClient { .send() .await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } /// Make an Azure Delete request @@ -620,8 +622,9 @@ impl AzureClient { .sensitive(sensitive) .send() .await - .context(DeleteRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::DeleteRequest { source, path } })?; Ok(()) @@ -693,14 +696,14 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(BulkDeleteRequestSnafu {})?; + .map_err(|source| Error::BulkDeleteRequest { source })?; let boundary = parse_multipart_response_boundary(&batch_response)?; let batch_body = batch_response .bytes() .await - .context(BulkDeleteRequestBodySnafu {})?; + .map_err(|source| Error::BulkDeleteRequestBody { source })?; let results = parse_blob_batch_delete_body(batch_body, boundary, &paths).await?; @@ -780,13 +783,13 @@ impl AzureClient { .idempotent(true) .send() .await - .context(DelegationKeyRequestSnafu)? + .map_err(|source| Error::DelegationKeyRequest { source })? 
.bytes() .await - .context(DelegationKeyResponseBodySnafu)?; + .map_err(|source| Error::DelegationKeyResponseBody { source })?; - let response: UserDelegationKey = - quick_xml::de::from_reader(response.reader()).context(DelegationKeyResponseSnafu)?; + let response: UserDelegationKey = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::DelegationKeyResponse { source })?; Ok(response) } @@ -842,9 +845,11 @@ impl AzureClient { .sensitive(sensitive) .send() .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; + Ok(response) } } @@ -900,8 +905,9 @@ impl GetClient for AzureClient { .sensitive(sensitive) .send() .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; match response.headers().get("x-ms-resource-type") { @@ -962,13 +968,14 @@ impl ListClient for AzureClient { .sensitive(sensitive) .send() .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? .bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; + + let mut response: ListResultInternal = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; - let mut response: ListResultInternal = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_marker.take(); Ok((to_list_result(response, prefix)?, token)) diff --git a/src/azure/credential.rs b/src/azure/credential.rs index 2832eed..c9e6ac6 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -32,7 +32,6 @@ use reqwest::header::{ }; use reqwest::{Client, Method, Request, RequestBuilder}; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Debug; @@ -71,27 +70,27 @@ const AZURE_STORAGE_SCOPE: &str = "https://storage.azure.com/.default"; /// const AZURE_STORAGE_RESOURCE: &str = "https://storage.azure.com"; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Error performing token request: {}", source))] + #[error("Error performing token request: {}", source)] TokenRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting token response body: {}", source))] + #[error("Error getting token response body: {}", source)] TokenResponseBody { source: reqwest::Error }, - #[snafu(display("Error reading federated token file "))] + #[error("Error reading federated token file ")] FederatedTokenFile, - #[snafu(display("Invalid Access Key: {}", source))] + #[error("Invalid Access Key: {}", source)] InvalidAccessKey { source: base64::DecodeError }, - #[snafu(display("'az account get-access-token' command failed: {message}"))] + #[error("'az account get-access-token' command failed: {message}")] AzureCli { message: String }, - #[snafu(display("Failed to parse azure cli response: {source}"))] + #[error("Failed to parse azure cli response: {source}")] AzureCliResponse { source: serde_json::Error }, - #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] + #[error("Generating SAS keys with SAS tokens auth is not supported")] SASforSASNotSupported, } @@ -113,7 +112,10 @@ pub struct AzureAccessKey(Vec); impl AzureAccessKey { /// Create a new [`AzureAccessKey`], checking it for validity pub fn try_new(key: &str) -> 
Result { - let key = BASE64_STANDARD.decode(key).context(InvalidAccessKeySnafu)?; + let key = BASE64_STANDARD + .decode(key) + .map_err(|source| Error::InvalidAccessKey { source })?; + Ok(Self(key)) } } @@ -636,10 +638,10 @@ impl TokenProvider for ClientSecretOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -744,10 +746,10 @@ impl TokenProvider for ImdsManagedIdentityProvider { let response: ImdsTokenResponse = builder .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -820,10 +822,10 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -900,7 +902,8 @@ impl AzureCliCredential { })?; let token_response = serde_json::from_str::(output) - .context(AzureCliResponseSnafu)?; + .map_err(|source| Error::AzureCliResponse { source })?; + if !token_response.token_type.eq_ignore_ascii_case("bearer") { return Err(Error::AzureCli { message: format!( @@ -1033,10 +1036,10 @@ impl TokenProvider for FabricTokenOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? 
.text() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; let exp_in = Self::validate_and_get_expiry(&access_token) .map_or(3600, |expiry| expiry - Self::get_current_timestamp()); Ok(TemporaryToken { diff --git a/src/client/get.rs b/src/client/get.rs index 5dd62cb..57aca89 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -29,7 +29,6 @@ use hyper::header::{ use hyper::StatusCode; use reqwest::header::ToStrError; use reqwest::Response; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A client that can perform a get request #[async_trait] @@ -95,49 +94,51 @@ impl ContentRange { } /// A specialized `Error` for get-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum GetResultError { - #[snafu(context(false))] + #[error(transparent)] Header { + #[from] source: crate::client::header::Error, }, - #[snafu(transparent)] + #[error(transparent)] InvalidRangeRequest { + #[from] source: crate::util::InvalidGetRange, }, - #[snafu(display("Received non-partial response when range requested"))] + #[error("Received non-partial response when range requested")] NotPartial, - #[snafu(display("Content-Range header not present in partial response"))] + #[error("Content-Range header not present in partial response")] NoContentRange, - #[snafu(display("Failed to parse value for CONTENT_RANGE header: \"{value}\""))] + #[error("Failed to parse value for CONTENT_RANGE header: \"{value}\"")] ParseContentRange { value: String }, - #[snafu(display("Content-Range header contained non UTF-8 characters"))] + #[error("Content-Range header contained non UTF-8 characters")] InvalidContentRange { source: ToStrError }, - #[snafu(display("Cache-Control header contained non UTF-8 characters"))] + #[error("Cache-Control header contained non UTF-8 characters")] InvalidCacheControl { source: ToStrError }, - #[snafu(display("Content-Disposition header contained non UTF-8 characters"))] + #[error("Content-Disposition header contained non UTF-8 characters")] InvalidContentDisposition { source: ToStrError }, - #[snafu(display("Content-Encoding header contained non UTF-8 characters"))] + #[error("Content-Encoding header contained non UTF-8 characters")] InvalidContentEncoding { source: ToStrError }, - #[snafu(display("Content-Language header contained non UTF-8 characters"))] + #[error("Content-Language header contained non UTF-8 characters")] InvalidContentLanguage { source: ToStrError }, - #[snafu(display("Content-Type header contained non UTF-8 characters"))] + #[error("Content-Type header contained non UTF-8 characters")] InvalidContentType { source: ToStrError }, - #[snafu(display("Metadata value for \"{key:?}\" contained non UTF-8 characters"))] + #[error("Metadata value for \"{key:?}\" contained non UTF-8 characters")] InvalidMetadata { key: String }, - #[snafu(display("Requested {expected:?}, got {actual:?}"))] + #[error("Requested {expected:?}, got {actual:?}")] UnexpectedRange { expected: Range, actual: Range, @@ -153,17 +154,24 @@ fn get_result( // ensure that we receive the range we asked for let range = if let Some(expected) = range { - ensure!( - response.status() == StatusCode::PARTIAL_CONTENT, - NotPartialSnafu - ); + if response.status() != StatusCode::PARTIAL_CONTENT { + return Err(GetResultError::NotPartial); + } + let val = response .headers() .get(CONTENT_RANGE) - .context(NoContentRangeSnafu)?; + .ok_or(GetResultError::NoContentRange)?; + + let value = val + .to_str() + .map_err(|source| 
GetResultError::InvalidContentRange { source })?; + + let value = ContentRange::from_str(value).ok_or_else(|| { + let value = value.into(); + GetResultError::ParseContentRange { value } + })?; - let value = val.to_str().context(InvalidContentRangeSnafu)?; - let value = ContentRange::from_str(value).context(ParseContentRangeSnafu { value })?; let actual = value.range; // Update size to reflect full size of object (#5272) @@ -171,10 +179,9 @@ fn get_result( let expected = expected.as_range(meta.size)?; - ensure!( - actual == expected, - UnexpectedRangeSnafu { expected, actual } - ); + if actual != expected { + return Err(GetResultError::UnexpectedRange { expected, actual }); + } actual } else { @@ -182,11 +189,11 @@ fn get_result( }; macro_rules! parse_attributes { - ($headers:expr, $(($header:expr, $attr:expr, $err:expr)),*) => {{ + ($headers:expr, $(($header:expr, $attr:expr, $map_err:expr)),*) => {{ let mut attributes = Attributes::new(); $( if let Some(x) = $headers.get($header) { - let x = x.to_str().context($err)?; + let x = x.to_str().map_err($map_err)?; attributes.insert($attr, x.to_string().into()); } )* @@ -196,31 +203,23 @@ fn get_result( let mut attributes = parse_attributes!( response.headers(), - ( - CACHE_CONTROL, - Attribute::CacheControl, - InvalidCacheControlSnafu - ), + (CACHE_CONTROL, Attribute::CacheControl, |source| { + GetResultError::InvalidCacheControl { source } + }), ( CONTENT_DISPOSITION, Attribute::ContentDisposition, - InvalidContentDispositionSnafu - ), - ( - CONTENT_ENCODING, - Attribute::ContentEncoding, - InvalidContentEncodingSnafu + |source| GetResultError::InvalidContentDisposition { source } ), - ( - CONTENT_LANGUAGE, - Attribute::ContentLanguage, - InvalidContentLanguageSnafu - ), - ( - CONTENT_TYPE, - Attribute::ContentType, - InvalidContentTypeSnafu - ) + (CONTENT_ENCODING, Attribute::ContentEncoding, |source| { + GetResultError::InvalidContentEncoding { source } + }), + (CONTENT_LANGUAGE, Attribute::ContentLanguage, |source| { + GetResultError::InvalidContentLanguage { source } + }), + (CONTENT_TYPE, Attribute::ContentType, |source| { + GetResultError::InvalidContentType { source } + }) ); // Add attributes that match the user-defined metadata prefix (e.g. 
x-amz-meta-) diff --git a/src/client/header.rs b/src/client/header.rs index 07c04c1..db06da6 100644 --- a/src/client/header.rs +++ b/src/client/header.rs @@ -22,7 +22,6 @@ use crate::ObjectMeta; use chrono::{DateTime, TimeZone, Utc}; use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; use hyper::HeaderMap; -use snafu::{OptionExt, ResultExt, Snafu}; #[derive(Debug, Copy, Clone)] /// Configuration for header extraction @@ -44,27 +43,27 @@ pub(crate) struct HeaderConfig { pub user_defined_metadata_prefix: Option<&'static str>, } -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("ETag Header missing from response"))] + #[error("ETag Header missing from response")] MissingEtag, - #[snafu(display("Received header containing non-ASCII data"))] + #[error("Received header containing non-ASCII data")] BadHeader { source: reqwest::header::ToStrError }, - #[snafu(display("Last-Modified Header missing from response"))] + #[error("Last-Modified Header missing from response")] MissingLastModified, - #[snafu(display("Content-Length Header missing from response"))] + #[error("Content-Length Header missing from response")] MissingContentLength, - #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] + #[error("Invalid last modified '{}': {}", last_modified, source)] InvalidLastModified { last_modified: String, source: chrono::ParseError, }, - #[snafu(display("Invalid content length '{}': {}", content_length, source))] + #[error("Invalid content length '{}': {}", content_length, source)] InvalidContentLength { content_length: String, source: std::num::ParseIntError, @@ -86,7 +85,11 @@ pub(crate) fn get_put_result( #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub(crate) fn get_version(headers: &HeaderMap, version: &str) -> Result, Error> { Ok(match headers.get(version) { - Some(x) => Some(x.to_str().context(BadHeaderSnafu)?.to_string()), + Some(x) => Some( + x.to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string(), + ), None => None, }) } @@ -94,7 +97,10 @@ pub(crate) fn get_version(headers: &HeaderMap, version: &str) -> Result Result { let e_tag = headers.get(ETAG).ok_or(Error::MissingEtag)?; - Ok(e_tag.to_str().context(BadHeaderSnafu)?.to_string()) + Ok(e_tag + .to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string()) } /// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] @@ -105,9 +111,15 @@ pub(crate) fn header_meta( ) -> Result { let last_modified = match headers.get(LAST_MODIFIED) { Some(last_modified) => { - let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; + let last_modified = last_modified + .to_str() + .map_err(|source| Error::BadHeader { source })?; + DateTime::parse_from_rfc2822(last_modified) - .context(InvalidLastModifiedSnafu { last_modified })? + .map_err(|source| Error::InvalidLastModified { + last_modified: last_modified.into(), + source, + })? 
.with_timezone(&Utc) } None if cfg.last_modified_required => return Err(Error::MissingLastModified), @@ -122,15 +134,25 @@ pub(crate) fn header_meta( let content_length = headers .get(CONTENT_LENGTH) - .context(MissingContentLengthSnafu)?; + .ok_or(Error::MissingContentLength)?; + + let content_length = content_length + .to_str() + .map_err(|source| Error::BadHeader { source })?; - let content_length = content_length.to_str().context(BadHeaderSnafu)?; let size = content_length .parse() - .context(InvalidContentLengthSnafu { content_length })?; + .map_err(|source| Error::InvalidContentLength { + content_length: content_length.into(), + source, + })?; let version = match cfg.version_header.and_then(|h| headers.get(h)) { - Some(v) => Some(v.to_str().context(BadHeaderSnafu)?.to_string()), + Some(v) => Some( + v.to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string(), + ), None => None, }; diff --git a/src/client/retry.rs b/src/client/retry.rs index a8a8e58..8938b08 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -22,30 +22,29 @@ use crate::PutPayload; use futures::future::BoxFuture; use reqwest::header::LOCATION; use reqwest::{Client, Request, Response, StatusCode}; -use snafu::Error as SnafuError; -use snafu::Snafu; +use std::error::Error as StdError; use std::time::{Duration, Instant}; use tracing::info; /// Retry request error -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Received redirect without LOCATION, this normally indicates an incorrectly configured region"))] + #[error("Received redirect without LOCATION, this normally indicates an incorrectly configured region")] BareRedirect, - #[snafu(display("Server error, body contains Error, with status {status}: {}", body.as_deref().unwrap_or("No Body")))] + #[error("Server error, body contains Error, with status {status}: {}", body.as_deref().unwrap_or("No Body"))] Server { status: StatusCode, body: Option, }, - #[snafu(display("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body")))] + #[error("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body"))] Client { status: StatusCode, body: Option, }, - #[snafu(display("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}"))] + #[error("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}")] Reqwest { retries: usize, max_retries: usize, diff --git a/src/delimited.rs b/src/delimited.rs index 96f88bf..5b11a0b 100644 --- a/src/delimited.rs +++ b/src/delimited.rs @@ -21,16 +21,15 @@ use std::collections::VecDeque; use bytes::Bytes; use futures::{Stream, StreamExt}; -use snafu::{ensure, Snafu}; use super::Result; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("encountered unterminated string"))] + #[error("encountered unterminated string")] UnterminatedString, - #[snafu(display("encountered trailing escape character"))] + #[error("encountered trailing escape character")] TrailingEscape, } @@ -125,8 +124,12 @@ impl LineDelimiter { /// Returns `true` if there is no remaining data to be read fn finish(&mut self) -> Result { if !self.remainder.is_empty() { - ensure!(!self.is_quote, UnterminatedStringSnafu); - ensure!(!self.is_escape, TrailingEscapeSnafu); + if self.is_quote { + Err(Error::UnterminatedString)?; + } + if self.is_escape { + Err(Error::TrailingEscape)?; + } self.complete 
.push_back(Bytes::from(std::mem::take(&mut self.remainder))) diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index fac923c..cc5c1e1 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -27,7 +27,6 @@ use crate::gcp::{ }; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -37,33 +36,33 @@ use super::credential::{AuthorizedUserSigningCredentials, InstanceSigningCredent const TOKEN_MIN_TTL: Duration = Duration::from_secs(4 * 60); -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Missing bucket name"))] + #[error("Missing bucket name")] MissingBucketName {}, - #[snafu(display("One of service account path or service account key may be provided."))] + #[error("One of service account path or service account key may be provided.")] ServiceAccountPathAndKeyProvided, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, - #[snafu(display("GCP credential error: {}", source))] + #[error("GCP credential error: {}", source)] Credential { source: credential::Error }, } @@ -319,12 +318,21 @@ impl GoogleCloudStorageBuilder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| Error::UnableToParseUrl { + source, + url: url.to_string(), + })?; + + let host = parsed.host_str().ok_or_else(|| Error::UrlNotRecognised { + url: url.to_string(), + })?; match parsed.scheme() { "gs" => self.bucket_name = Some(host.to_string()), - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.to_string(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } } Ok(()) } @@ -428,12 +436,14 @@ impl GoogleCloudStorageBuilder { // First try to initialize from the service account information. let service_account_credentials = match (self.service_account_path, self.service_account_key) { - (Some(path), None) => { - Some(ServiceAccountCredentials::from_file(path).context(CredentialSnafu)?) - } - (None, Some(key)) => { - Some(ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?) 
- } + (Some(path), None) => Some( + ServiceAccountCredentials::from_file(path) + .map_err(|source| Error::Credential { source })?, + ), + (None, Some(key)) => Some( + ServiceAccountCredentials::from_key(&key) + .map_err(|source| Error::Credential { source })?, + ), (None, None) => None, (Some(_), Some(_)) => return Err(Error::ServiceAccountPathAndKeyProvided.into()), }; diff --git a/src/gcp/client.rs b/src/gcp/client.rs index ccc9c34..1928d13 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -44,7 +44,6 @@ use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::header::HeaderName; use reqwest::{Client, Method, RequestBuilder, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::sync::Arc; const VERSION_HEADER: &str = "x-goog-generation"; @@ -53,62 +52,62 @@ const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-goog-meta-"; static VERSION_MATCH: HeaderName = HeaderName::from_static("x-goog-if-generation-match"); -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error performing get request {}: {}", path, source))] + #[error("Error performing get request {}: {}", path, source)] GetRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing request {}: {}", path, source))] + #[error("Error performing request {}: {}", path, source)] Request { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error getting put response body: {}", source))] + #[error("Error getting put response body: {}", source)] PutResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid put request: {}", source))] + #[error("Got invalid put request: {}", source)] InvalidPutRequest { source: quick_xml::se::SeError }, - #[snafu(display("Got invalid put response: {}", source))] + #[error("Got invalid put response: {}", source)] InvalidPutResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, - #[snafu(display("Version required for conditional update"))] + #[error("Version required for conditional update")] MissingVersion, - #[snafu(display("Error performing complete multipart request: {}", source))] + #[error("Error performing complete multipart request: {}", source)] CompleteMultipartRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting complete multipart response body: {}", source))] + #[error("Error getting complete multipart response body: {}", source)] CompleteMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid multipart response: {}", source))] + #[error("Got invalid multipart response: {}", source)] InvalidMultipartResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error signing blob: {}", source))] + #[error("Error signing blob: {}", source)] 
SignBlobRequest { source: crate::client::retry::Error }, - #[snafu(display("Got invalid signing blob response: {}", source))] + #[error("Got invalid signing blob response: {}", source)] InvalidSignBlobResponse { source: reqwest::Error }, - #[snafu(display("Got invalid signing blob signature: {}", source))] + #[error("Got invalid signing blob signature: {}", source)] InvalidSignBlobSignature { source: base64::DecodeError }, } @@ -236,15 +235,17 @@ impl<'a> Request<'a> { .payload(self.payload) .send() .await - .context(RequestSnafu { - path: self.path.as_ref(), + .map_err(|source| { + let path = self.path.as_ref().into(); + Error::Request { source, path } })?; Ok(resp) } async fn do_put(self) -> Result { let response = self.send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } } @@ -336,17 +337,17 @@ impl GoogleCloudStorageClient { .idempotent(true) .send() .await - .context(SignBlobRequestSnafu)?; + .map_err(|source| Error::SignBlobRequest { source })?; //If successful, the signature is returned in the signedBlob field in the response. let response = response .json::() .await - .context(InvalidSignBlobResponseSnafu)?; + .map_err(|source| Error::InvalidSignBlobResponse { source })?; let signed_blob = BASE64_STANDARD .decode(response.signed_blob) - .context(InvalidSignBlobSignatureSnafu)?; + .map_err(|source| Error::InvalidSignBlobSignature { source })?; Ok(hex_encode(&signed_blob)) } @@ -389,7 +390,7 @@ impl GoogleCloudStorageClient { PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&VERSION_MATCH, "0"), PutMode::Update(v) => { - let etag = v.version.as_ref().context(MissingVersionSnafu)?; + let etag = v.version.as_ref().ok_or(Error::MissingVersion)?; builder.header(&VERSION_MATCH, etag) } }; @@ -443,9 +444,14 @@ impl GoogleCloudStorageClient { .send() .await?; - let data = response.bytes().await.context(PutResponseBodySnafu)?; + let data = response + .bytes() + .await + .map_err(|source| Error::PutResponseBody { source })?; + let result: InitiateMultipartUploadResult = - quick_xml::de::from_reader(data.as_ref().reader()).context(InvalidPutResponseSnafu)?; + quick_xml::de::from_reader(data.as_ref().reader()) + .map_err(|source| Error::InvalidPutResponse { source })?; Ok(result.upload_id) } @@ -467,8 +473,9 @@ impl GoogleCloudStorageClient { .query(&[("uploadId", multipart_id)]) .send_retry(&self.config.retry_config) .await - .context(RequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::Request { source, path } })?; Ok(()) @@ -498,7 +505,7 @@ impl GoogleCloudStorageClient { let credential = self.get_credential().await?; let data = quick_xml::se::to_string(&upload_info) - .context(InvalidPutRequestSnafu)? + .map_err(|source| Error::InvalidPutRequest { source })? 
// We cannot disable the escaping that transforms "/" to "&quote;" :( + // https://github.com/tafia/quick-xml/issues/362 + // https://github.com/tafia/quick-xml/issues/350 @@ -514,17 +521,18 @@ impl GoogleCloudStorageClient { .idempotent(true) .send() .await - .context(CompleteMultipartRequestSnafu)?; + .map_err(|source| Error::CompleteMultipartRequest { source })?; - let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + let version = get_version(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?; let data = response .bytes() .await - .context(CompleteMultipartResponseBodySnafu)?; + .map_err(|source| Error::CompleteMultipartResponseBody { source })?; - let response: CompleteMultipartUploadResult = - quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + let response: CompleteMultipartUploadResult = quick_xml::de::from_reader(data.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(PutResult { e_tag: Some(response.e_tag), @@ -615,8 +623,9 @@ impl GetClient for GoogleCloudStorageClient { .with_get_options(options) .send_retry(&self.config.retry_config) .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; Ok(response) @@ -665,13 +674,13 @@ impl ListClient for GoogleCloudStorageClient { .bearer_auth(&credential.bearer) .send_retry(&self.config.retry_config) .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? .bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; - let mut response: ListResponse = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 155a80b..4b21ad1 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -33,7 +33,6 @@ use percent_encoding::utf8_percent_encode; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::collections::BTreeMap; use std::env; use std::fs::File; @@ -54,36 +53,39 @@ const DEFAULT_GCS_SIGN_BLOB_HOST: &str = "storage.googleapis.com"; const DEFAULT_METADATA_HOST: &str = "metadata.google.internal"; const DEFAULT_METADATA_IP: &str = "169.254.169.254"; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] + #[error("Unable to open service account file from {}: {}", path.display(), source)] OpenCredentials { source: std::io::Error, path: PathBuf, }, - #[snafu(display("Unable to decode service account file: {}", source))] + #[error("Unable to decode service account file: {}", source)] DecodeCredentials { source: serde_json::Error }, - #[snafu(display("No RSA key found in pem file"))] + #[error("No RSA key found in pem file")] MissingKey, - #[snafu(display("Invalid RSA key: {}", source), context(false))] - InvalidKey { source: ring::error::KeyRejected }, + #[error("Invalid RSA key: {}", source)] + InvalidKey { + #[from] + source: ring::error::KeyRejected, + }, - #[snafu(display("Error signing: {}", source))] + #[error("Error signing: 
{}", source)] Sign { source: ring::error::Unspecified }, - #[snafu(display("Error encoding jwt payload: {}", source))] + #[error("Error encoding jwt payload: {}", source)] Encode { source: serde_json::Error }, - #[snafu(display("Unsupported key encoding: {}", encoding))] + #[error("Unsupported key encoding: {}", encoding)] UnsupportedKey { encoding: String }, - #[snafu(display("Error performing token request: {}", source))] + #[error("Error performing token request: {}", source)] TokenRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting token response body: {}", source))] + #[error("Error getting token response body: {}", source)] TokenResponseBody { source: reqwest::Error }, } @@ -153,7 +155,7 @@ impl ServiceAccountKey { string_to_sign.as_bytes(), &mut signature, ) - .context(SignSnafu)?; + .map_err(|source| Error::Sign { source })?; Ok(hex_encode(&signature)) } @@ -289,7 +291,7 @@ impl TokenProvider for SelfSignedJwt { message.as_bytes(), &mut sig_bytes, ) - .context(SignSnafu)?; + .map_err(|source| Error::Sign { source })?; let signature = BASE64_URL_SAFE_NO_PAD.encode(sig_bytes); let bearer = [message, signature].join("."); @@ -305,11 +307,12 @@ fn read_credentials_file(service_account_path: impl AsRef) - where T: serde::de::DeserializeOwned, { - let file = File::open(&service_account_path).context(OpenCredentialsSnafu { - path: service_account_path.as_ref().to_owned(), + let file = File::open(&service_account_path).map_err(|source| { + let path = service_account_path.as_ref().to_owned(); + Error::OpenCredentials { source, path } })?; let reader = BufReader::new(file); - serde_json::from_reader(reader).context(DecodeCredentialsSnafu) + serde_json::from_reader(reader).map_err(|source| Error::DecodeCredentials { source }) } /// A deserialized `service-account-********.json`-file. @@ -341,7 +344,7 @@ impl ServiceAccountCredentials { /// Create a new [`ServiceAccountCredentials`] from a string. pub(crate) fn from_key(key: &str) -> Result { - serde_json::from_str(key).context(DecodeCredentialsSnafu) + serde_json::from_str(key).map_err(|source| Error::DecodeCredentials { source }) } /// Create a [`SelfSignedJwt`] from this credentials struct. @@ -380,7 +383,7 @@ fn seconds_since_epoch() -> u64 { } fn b64_encode_obj(obj: &T) -> Result { - let string = serde_json::to_string(obj).context(EncodeSnafu)?; + let string = serde_json::to_string(obj).map_err(|source| Error::Encode { source })?; Ok(BASE64_URL_SAFE_NO_PAD.encode(string)) } @@ -404,10 +407,10 @@ async fn make_metadata_request( .query(&[("audience", "https://www.googleapis.com/oauth2/v4/token")]) .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response) } @@ -467,10 +470,10 @@ async fn make_metadata_request_for_email( .header("Metadata-Flavor", "Google") .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .text() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response) } @@ -608,10 +611,10 @@ impl AuthorizedUserSigningCredentials { .query(&[("access_token", &self.credential.refresh_token)]) .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? 
.json::() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response.email) } @@ -659,10 +662,10 @@ impl TokenProvider for AuthorizedUserCredentials { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json::() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(GcpCredential { diff --git a/src/http/client.rs b/src/http/client.rs index eeb7e56..41e6464 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -32,42 +32,41 @@ use hyper::header::{ use percent_encoding::percent_decode_str; use reqwest::{Method, Response, StatusCode}; use serde::Deserialize; -use snafu::{OptionExt, ResultExt, Snafu}; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Request error: {}", source))] + #[error("Request error: {}", source)] Request { source: retry::Error }, - #[snafu(display("Request error: {}", source))] + #[error("Request error: {}", source)] Reqwest { source: reqwest::Error }, - #[snafu(display("Range request not supported by {}", href))] + #[error("Range request not supported by {}", href)] RangeNotSupported { href: String }, - #[snafu(display("Error decoding PROPFIND response: {}", source))] + #[error("Error decoding PROPFIND response: {}", source)] InvalidPropFind { source: quick_xml::de::DeError }, - #[snafu(display("Missing content size for {}", href))] + #[error("Missing content size for {}", href)] MissingSize { href: String }, - #[snafu(display("Error getting properties of \"{}\" got \"{}\"", href, status))] + #[error("Error getting properties of \"{}\" got \"{}\"", href, status)] PropStatus { href: String, status: String }, - #[snafu(display("Failed to parse href \"{}\": {}", href, source))] + #[error("Failed to parse href \"{}\": {}", href, source)] InvalidHref { href: String, source: url::ParseError, }, - #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + #[error("Path \"{}\" contained non-unicode characters: {}", path, source)] NonUnicode { path: String, source: std::str::Utf8Error, }, - #[snafu(display("Encountered invalid path \"{}\": {}", path, source))] + #[error("Encountered invalid path \"{}\": {}", path, source)] InvalidPath { path: String, source: crate::path::Error, @@ -129,7 +128,7 @@ impl Client { .request(method, url) .send_retry(&self.retry_config) .await - .context(RequestSnafu)?; + .map_err(|source| Error::Request { source })?; Ok(()) } @@ -236,7 +235,10 @@ impl Client { .await; let response = match result { - Ok(result) => result.bytes().await.context(ReqwestSnafu)?, + Ok(result) => result + .bytes() + .await + .map_err(|source| Error::Reqwest { source })?, Err(e) if matches!(e.status(), Some(StatusCode::NOT_FOUND)) => { return match depth { "0" => { @@ -255,7 +257,9 @@ impl Client { Err(source) => return Err(Error::Request { source }.into()), }; - let status = quick_xml::de::from_reader(response.reader()).context(InvalidPropFindSnafu)?; + let status = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidPropFind { source })?; + Ok(status) } @@ -397,14 +401,23 @@ impl MultiStatusResponse { let url = Url::options() .base_url(Some(base_url)) .parse(&self.href) - .context(InvalidHrefSnafu { href: &self.href })?; + .map_err(|source| Error::InvalidHref { + href: self.href.clone(), + source, + })?; // Reverse any percent encoding let path = 
percent_decode_str(url.path()) .decode_utf8() - .context(NonUnicodeSnafu { path: url.path() })?; + .map_err(|source| Error::NonUnicode { + path: url.path().into(), + source, + })?; - Ok(Path::parse(path.as_ref()).context(InvalidPathSnafu { path })?) + Ok(Path::parse(path.as_ref()).map_err(|source| { + let path = path.into(); + Error::InvalidPath { path, source } + })?) } fn size(&self) -> Result { @@ -412,7 +425,10 @@ impl MultiStatusResponse { .prop_stat .prop .content_length - .context(MissingSizeSnafu { href: &self.href })?; + .ok_or_else(|| Error::MissingSize { + href: self.href.clone(), + })?; + Ok(size) } diff --git a/src/http/mod.rs b/src/http/mod.rs index 4b1c927..417f728 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -35,7 +35,6 @@ use async_trait::async_trait; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; -use snafu::{OptionExt, ResultExt, Snafu}; use url::Url; use crate::client::get::GetClientExt; @@ -49,18 +48,18 @@ use crate::{ mod client; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Must specify a URL"))] + #[error("Must specify a URL")] MissingUrl, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, @@ -235,8 +234,8 @@ impl HttpBuilder { /// Build an [`HttpStore`] with the configured options pub fn build(self) -> Result { - let url = self.url.context(MissingUrlSnafu)?; - let parsed = Url::parse(&url).context(UnableToParseUrlSnafu { url })?; + let url = self.url.ok_or(Error::MissingUrl)?; + let parsed = Url::parse(&url).map_err(|source| Error::UnableToParseUrl { url, source })?; Ok(HttpStore { client: Client::new(parsed, self.client_options, self.retry_config)?, diff --git a/src/lib.rs b/src/lib.rs index 6f57332..987ffac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -566,7 +566,6 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; -use snafu::Snafu; use std::fmt::{Debug, Formatter}; #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use std::io::{Read, Seek, SeekFrom}; @@ -1229,11 +1228,11 @@ pub struct PutResult { pub type Result = std::result::Result; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] #[non_exhaustive] pub enum Error { /// A fallback error type when no variant matches - #[snafu(display("Generic {} error: {}", store, source))] + #[error("Generic {} error: {}", store, source)] Generic { /// The store this error originated from store: &'static str, @@ -1242,7 +1241,7 @@ pub enum Error { }, /// Error when the object is not found at given location - #[snafu(display("Object at location {} not found: {}", path, source))] + #[error("Object at location {} not found: {}", path, source)] NotFound { /// The path to file path: String, @@ -1251,31 +1250,30 @@ pub enum Error { }, /// Error for invalid path - #[snafu( - display("Encountered object with invalid path: {}", source), - context(false) - )] + #[error("Encountered object with invalid path: {}", source)] InvalidPath { /// The wrapped error + #[from] source: path::Error, }, /// Error when `tokio::spawn` 
failed - #[snafu(display("Error joining spawned task: {}", source), context(false))] + #[error("Error joining spawned task: {}", source)] JoinError { /// The wrapped error + #[from] source: tokio::task::JoinError, }, /// Error when the attempted operation is not supported - #[snafu(display("Operation not supported: {}", source))] + #[error("Operation not supported: {}", source)] NotSupported { /// The wrapped error source: Box, }, /// Error when the object already exists - #[snafu(display("Object at location {} already exists: {}", path, source))] + #[error("Object at location {} already exists: {}", path, source)] AlreadyExists { /// The path to the path: String, @@ -1284,7 +1282,7 @@ pub enum Error { }, /// Error when the required conditions failed for the operation - #[snafu(display("Request precondition failure for path {}: {}", path, source))] + #[error("Request precondition failure for path {}: {}", path, source)] Precondition { /// The path to the file path: String, @@ -1293,7 +1291,7 @@ pub enum Error { }, /// Error when the object at the location isn't modified - #[snafu(display("Object at location {} not modified: {}", path, source))] + #[error("Object at location {} not modified: {}", path, source)] NotModified { /// The path to the file path: String, @@ -1302,16 +1300,16 @@ pub enum Error { }, /// Error when an operation is not implemented - #[snafu(display("Operation not yet implemented."))] + #[error("Operation not yet implemented.")] NotImplemented, /// Error when the used credentials don't have enough permission /// to perform the requested operation - #[snafu(display( + #[error( "The operation lacked the necessary privileges to complete for path {}: {}", path, source - ))] + )] PermissionDenied { /// The path to the file path: String, @@ -1320,11 +1318,11 @@ pub enum Error { }, /// Error when the used credentials lack valid authentication - #[snafu(display( + #[error( "The operation lacked valid authentication credentials for path {}: {}", path, source - ))] + )] Unauthenticated { /// The path to the file path: String, @@ -1333,7 +1331,7 @@ pub enum Error { }, /// Error when a configuration key is invalid for the store used - #[snafu(display("Configuration key: '{}' is not valid for store '{}'.", key, store))] + #[error("Configuration key: '{}' is not valid for store '{}'.", key, store)] UnknownConfigurationKey { /// The object store used store: &'static str, diff --git a/src/local.rs b/src/local.rs index 78fce9c..b193481 100644 --- a/src/local.rs +++ b/src/local.rs @@ -30,7 +30,6 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use futures::{FutureExt, TryStreamExt}; use parking_lot::Mutex; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; use url::Url; use walkdir::{DirEntry, WalkDir}; @@ -43,117 +42,80 @@ use crate::{ }; /// A specialized `Error` for filesystem object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("File size for {} did not fit in a usize: {}", path, source))] + #[error("File size for {} did not fit in a usize: {}", path, source)] FileSizeOverflowedUsize { source: std::num::TryFromIntError, path: String, }, - #[snafu(display("Unable to walk dir: {}", source))] - UnableToWalkDir { - source: walkdir::Error, - }, + #[error("Unable to walk dir: {}", source)] + UnableToWalkDir { source: walkdir::Error }, - #[snafu(display("Unable to access metadata for {}: {}", path, source))] + #[error("Unable to access metadata for {}: {}", path, source)] Metadata { 
source: Box, path: String, }, - #[snafu(display("Unable to copy data to file: {}", source))] - UnableToCopyDataToFile { - source: io::Error, - }, + #[error("Unable to copy data to file: {}", source)] + UnableToCopyDataToFile { source: io::Error }, - #[snafu(display("Unable to rename file: {}", source))] - UnableToRenameFile { - source: io::Error, - }, + #[error("Unable to rename file: {}", source)] + UnableToRenameFile { source: io::Error }, - #[snafu(display("Unable to create dir {}: {}", path.display(), source))] - UnableToCreateDir { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to create dir {}: {}", path.display(), source)] + UnableToCreateDir { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to create file {}: {}", path.display(), source))] - UnableToCreateFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to create file {}: {}", path.display(), source)] + UnableToCreateFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to delete file {}: {}", path.display(), source))] - UnableToDeleteFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to delete file {}: {}", path.display(), source)] + UnableToDeleteFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to open file {}: {}", path.display(), source))] - UnableToOpenFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to open file {}: {}", path.display(), source)] + UnableToOpenFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to read data from file {}: {}", path.display(), source))] - UnableToReadBytes { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to read data from file {}: {}", path.display(), source)] + UnableToReadBytes { source: io::Error, path: PathBuf }, - #[snafu(display("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual))] + #[error("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual)] OutOfRange { path: PathBuf, expected: usize, actual: usize, }, - #[snafu(display("Requested range was invalid"))] - InvalidRange { - source: InvalidGetRange, - }, + #[error("Requested range was invalid")] + InvalidRange { source: InvalidGetRange }, - #[snafu(display("Unable to copy file from {} to {}: {}", from.display(), to.display(), source))] + #[error("Unable to copy file from {} to {}: {}", from.display(), to.display(), source)] UnableToCopyFile { from: PathBuf, to: PathBuf, source: io::Error, }, - NotFound { - path: PathBuf, - source: io::Error, - }, + #[error("NotFound")] + NotFound { path: PathBuf, source: io::Error }, - #[snafu(display("Error seeking file {}: {}", path.display(), source))] - Seek { - source: io::Error, - path: PathBuf, - }, + #[error("Error seeking file {}: {}", path.display(), source)] + Seek { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] - InvalidUrl { - url: Url, - }, + #[error("Unable to convert URL \"{}\" to filesystem path", url)] + InvalidUrl { url: Url }, - AlreadyExists { - path: String, - source: io::Error, - }, + #[error("AlreadyExists")] + AlreadyExists { path: String, source: io::Error }, - #[snafu(display("Unable to canonicalize filesystem root: {}", path.display()))] - UnableToCanonicalize { - path: PathBuf, - source: io::Error, - }, + #[error("Unable to canonicalize filesystem root: {}", path.display())] + UnableToCanonicalize { path: PathBuf, source: io::Error }, - #[snafu(display("Filenames containing trailing '/#\\d+/' are 
not supported: {}", path))] - InvalidPath { - path: String, - }, + #[error("Filenames containing trailing '/#\\d+/' are not supported: {}", path)] + InvalidPath { path: String }, - #[snafu(display("Upload aborted"))] + #[error("Upload aborted")] Aborted, } @@ -276,8 +238,9 @@ impl LocalFileSystem { /// Returns an error if the path does not exist /// pub fn new_with_prefix(prefix: impl AsRef) -> Result { - let path = std::fs::canonicalize(&prefix).context(UnableToCanonicalizeSnafu { - path: prefix.as_ref(), + let path = std::fs::canonicalize(&prefix).map_err(|source| { + let path = prefix.as_ref().into(); + Error::UnableToCanonicalize { source, path } })?; Ok(Self { @@ -290,12 +253,12 @@ impl LocalFileSystem { /// Return an absolute filesystem path of the given file location pub fn path_to_filesystem(&self, location: &Path) -> Result { - ensure!( - is_valid_file_path(location), - InvalidPathSnafu { - path: location.as_ref() - } - ); + if !is_valid_file_path(location) { + let path = location.as_ref().into(); + let error = Error::InvalidPath { path }; + return Err(error.into()); + } + let path = self.config.prefix_to_filesystem(location)?; #[cfg(target_os = "windows")] @@ -451,7 +414,9 @@ impl ObjectStore for LocalFileSystem { options.check_preconditions(&meta)?; let range = match options.range { - Some(r) => r.as_range(meta.size).context(InvalidRangeSnafu)?, + Some(r) => r + .as_range(meta.size) + .map_err(|source| Error::InvalidRange { source })?, None => 0..meta.size, }; @@ -721,12 +686,15 @@ impl ObjectStore for LocalFileSystem { /// Creates the parent directories of `path` or returns an error based on `source` if no parent fn create_parent_dirs(path: &std::path::Path, source: io::Error) -> Result<()> { - let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { - path: path.to_path_buf(), - source, + let parent = path.parent().ok_or_else(|| { + let path = path.to_path_buf(); + Error::UnableToCreateFile { path, source } })?; - std::fs::create_dir_all(parent).context(UnableToCreateDirSnafu { path: parent })?; + std::fs::create_dir_all(parent).map_err(|source| { + let path = parent.into(); + Error::UnableToCreateDir { source, path } + })?; Ok(()) } @@ -796,12 +764,14 @@ impl MultipartUpload for LocalUpload { let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { let mut file = s.file.lock(); - file.seek(SeekFrom::Start(offset)) - .context(SeekSnafu { path: &s.dest })?; + file.seek(SeekFrom::Start(offset)).map_err(|source| { + let path = s.dest.clone(); + Error::Seek { source, path } + })?; data.iter() .try_for_each(|x| file.write_all(x)) - .context(UnableToCopyDataToFileSnafu)?; + .map_err(|source| Error::UnableToCopyDataToFile { source })?; Ok(()) }) @@ -809,12 +779,13 @@ impl MultipartUpload for LocalUpload { } async fn complete(&mut self) -> Result { - let src = self.src.take().context(AbortedSnafu)?; + let src = self.src.take().ok_or(Error::Aborted)?; let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { // Ensure no inflight writes let file = s.file.lock(); - std::fs::rename(&src, &s.dest).context(UnableToRenameFileSnafu)?; + std::fs::rename(&src, &s.dest) + .map_err(|source| Error::UnableToRenameFile { source })?; let metadata = file.metadata().map_err(|e| Error::Metadata { source: e.into(), path: src.to_string_lossy().to_string(), @@ -829,9 +800,10 @@ impl MultipartUpload for LocalUpload { } async fn abort(&mut self) -> Result<()> { - let src = self.src.take().context(AbortedSnafu)?; + let src = self.src.take().ok_or(Error::Aborted)?; 
maybe_spawn_blocking(move || { - std::fs::remove_file(&src).context(UnableToDeleteFileSnafu { path: &src })?; + std::fs::remove_file(&src) + .map_err(|source| Error::UnableToDeleteFile { source, path: src })?; Ok(()) }) .await @@ -898,22 +870,30 @@ pub(crate) fn chunked_stream( pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) - .context(SeekSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::Seek { source, path } + })?; let mut buf = Vec::with_capacity(to_read); let read = file .take(to_read as u64) .read_to_end(&mut buf) - .context(UnableToReadBytesSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::UnableToReadBytes { source, path } + })?; - ensure!( - read == to_read, - OutOfRangeSnafu { - path, + if read != to_read { + let error = Error::OutOfRange { + path: path.into(), expected: to_read, - actual: read - } - ); + actual: read, + }; + + return Err(error.into()); + } + Ok(buf.into()) } @@ -982,8 +962,9 @@ fn get_etag(metadata: &Metadata) -> String { fn convert_metadata(metadata: Metadata, location: Path) -> Result { let last_modified = last_modified(&metadata); - let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { - path: location.as_ref(), + let size = usize::try_from(metadata.len()).map_err(|source| { + let path = location.as_ref().into(); + Error::FileSizeOverflowedUsize { source, path } })?; Ok(ObjectMeta { diff --git a/src/memory.rs b/src/memory.rs index a467e3b..3f3cff3 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -25,7 +25,6 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use parking_lot::RwLock; -use snafu::{OptionExt, ResultExt, Snafu}; use crate::multipart::{MultipartStore, PartId}; use crate::util::InvalidGetRange; @@ -37,24 +36,24 @@ use crate::{ use crate::{GetOptions, PutPayload}; /// A specialized `Error` for in-memory object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("No data in memory found. Location: {path}"))] + #[error("No data in memory found. 
Location: {path}")] NoDataInMemory { path: String }, - #[snafu(display("Invalid range: {source}"))] + #[error("Invalid range: {source}")] Range { source: InvalidGetRange }, - #[snafu(display("Object already exists at that location: {path}"))] + #[error("Object already exists at that location: {path}")] AlreadyExists { path: String }, - #[snafu(display("ETag required for conditional update"))] + #[error("ETag required for conditional update")] MissingETag, - #[snafu(display("MultipartUpload not found: {id}"))] + #[error("MultipartUpload not found: {id}")] UploadNotFound { id: String }, - #[snafu(display("Missing part at index: {part}"))] + #[error("Missing part at index: {part}")] MissingPart { part: usize }, } @@ -158,7 +157,7 @@ impl Storage { }), Some(e) => { let existing = e.e_tag.to_string(); - let expected = v.e_tag.context(MissingETagSnafu)?; + let expected = v.e_tag.ok_or(Error::MissingETag)?; if existing == expected { *e = entry; Ok(()) @@ -177,7 +176,7 @@ impl Storage { .parse() .ok() .and_then(|x| self.uploads.get_mut(&x)) - .context(UploadNotFoundSnafu { id })?; + .ok_or_else(|| Error::UploadNotFound { id: id.into() })?; Ok(parts) } @@ -186,7 +185,7 @@ impl Storage { .parse() .ok() .and_then(|x| self.uploads.remove(&x)) - .context(UploadNotFoundSnafu { id })?; + .ok_or_else(|| Error::UploadNotFound { id: id.into() })?; Ok(parts) } } @@ -250,7 +249,9 @@ impl ObjectStore for InMemory { let (range, data) = match options.range { Some(range) => { - let r = range.as_range(entry.data.len()).context(RangeSnafu)?; + let r = range + .as_range(entry.data.len()) + .map_err(|source| Error::Range { source })?; (r.clone(), entry.data.slice(r)) } None => (0..entry.data.len(), entry.data), @@ -272,7 +273,7 @@ impl ObjectStore for InMemory { .map(|range| { let r = GetRange::Bounded(range.clone()) .as_range(entry.data.len()) - .context(RangeSnafu)?; + .map_err(|source| Error::Range { source })?; Ok(entry.data.slice(r)) }) @@ -435,7 +436,7 @@ impl MultipartStore for InMemory { let mut cap = 0; for (part, x) in upload.parts.iter().enumerate() { - cap += x.as_ref().context(MissingPartSnafu { part })?.len(); + cap += x.as_ref().ok_or(Error::MissingPart { part })?.len(); } let mut buf = Vec::with_capacity(cap); for x in &upload.parts { @@ -474,7 +475,7 @@ impl InMemory { .map .get(location) .cloned() - .context(NoDataInMemorySnafu { + .ok_or_else(|| Error::NoDataInMemory { path: location.to_string(), })?; diff --git a/src/parse.rs b/src/parse.rs index a391930..bc65a0b 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -20,16 +20,18 @@ use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; use crate::ObjectStore; -use snafu::Snafu; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Unable to recognise URL \"{}\"", url))] + #[error("Unable to recognise URL \"{}\"", url)] Unrecognised { url: Url }, - #[snafu(context(false))] - Path { source: crate::path::Error }, + #[error(transparent)] + Path { + #[from] + source: crate::path::Error, + }, } impl From for super::Error { diff --git a/src/path/mod.rs b/src/path/mod.rs index 4c9bb5f..f8affe8 100644 --- a/src/path/mod.rs +++ b/src/path/mod.rs @@ -19,7 +19,6 @@ use itertools::Itertools; use percent_encoding::percent_decode; -use snafu::{ensure, ResultExt, Snafu}; use std::fmt::Formatter; #[cfg(not(target_arch = "wasm32"))] use url::Url; @@ -35,18 +34,18 @@ mod parts; pub use parts::{InvalidPart, PathPart}; /// Error returned by [`Path::parse`] -#[derive(Debug, Snafu)] 
+#[derive(Debug, thiserror::Error)] #[non_exhaustive] pub enum Error { /// Error when there's an empty segment between two slashes `/` in the path - #[snafu(display("Path \"{}\" contained empty path segment", path))] + #[error("Path \"{}\" contained empty path segment", path)] EmptySegment { /// The source path path: String, }, /// Error when an invalid segment is encountered in the given path - #[snafu(display("Error parsing Path \"{}\": {}", path, source))] + #[error("Error parsing Path \"{}\": {}", path, source)] BadSegment { /// The source path path: String, @@ -55,7 +54,7 @@ pub enum Error { }, /// Error when path cannot be canonicalized - #[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))] + #[error("Failed to canonicalize path \"{}\": {}", path.display(), source)] Canonicalize { /// The source path path: std::path::PathBuf, @@ -64,14 +63,14 @@ pub enum Error { }, /// Error when the path is not a valid URL - #[snafu(display("Unable to convert path \"{}\" to URL", path.display()))] + #[error("Unable to convert path \"{}\" to URL", path.display())] InvalidPath { /// The source path path: std::path::PathBuf, }, /// Error when a path contains non-unicode characters - #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + #[error("Path \"{}\" contained non-unicode characters: {}", path, source)] NonUnicode { /// The source path path: String, @@ -80,7 +79,7 @@ pub enum Error { }, /// Error when the a path doesn't start with given prefix - #[snafu(display("Path {} does not start with prefix {}", path, prefix))] + #[error("Path {} does not start with prefix {}", path, prefix)] PrefixMismatch { /// The source path path: String, @@ -173,8 +172,14 @@ impl Path { let stripped = stripped.strip_suffix(DELIMITER).unwrap_or(stripped); for segment in stripped.split(DELIMITER) { - ensure!(!segment.is_empty(), EmptySegmentSnafu { path }); - PathPart::parse(segment).context(BadSegmentSnafu { path })?; + if segment.is_empty() { + return Err(Error::EmptySegment { path: path.into() }); + } + + PathPart::parse(segment).map_err(|source| { + let path = path.into(); + Error::BadSegment { source, path } + })?; } Ok(Self { @@ -190,8 +195,9 @@ impl Path { /// /// Note: this will canonicalize the provided path, resolving any symlinks pub fn from_filesystem_path(path: impl AsRef) -> Result { - let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu { - path: path.as_ref(), + let absolute = std::fs::canonicalize(&path).map_err(|source| { + let path = path.as_ref().into(); + Error::Canonicalize { source, path } })?; Self::from_absolute_path(absolute) @@ -241,7 +247,10 @@ impl Path { let path = path.as_ref(); let decoded = percent_decode(path.as_bytes()) .decode_utf8() - .context(NonUnicodeSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::NonUnicode { source, path } + })?; Self::parse(decoded) } diff --git a/src/path/parts.rs b/src/path/parts.rs index de2e1a7..9c6612b 100644 --- a/src/path/parts.rs +++ b/src/path/parts.rs @@ -19,15 +19,14 @@ use percent_encoding::{percent_encode, AsciiSet, CONTROLS}; use std::borrow::Cow; use crate::path::DELIMITER_BYTE; -use snafu::Snafu; /// Error returned by [`PathPart::parse`] -#[derive(Debug, Snafu)] -#[snafu(display( +#[derive(Debug, thiserror::Error)] +#[error( "Encountered illegal character sequence \"{}\" whilst parsing path segment \"{}\"", illegal, segment -))] +)] #[allow(missing_copy_implementations)] pub struct InvalidPart { segment: String, diff --git a/src/util.rs 
b/src/util.rs index 99102a9..6d638f3 100644 --- a/src/util.rs +++ b/src/util.rs @@ -24,7 +24,6 @@ use std::{ use super::Result; use bytes::Bytes; use futures::{stream::StreamExt, Stream, TryStreamExt}; -use snafu::Snafu; #[cfg(any(feature = "azure", feature = "http"))] pub(crate) static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; @@ -204,14 +203,12 @@ pub enum GetRange { Suffix(usize), } -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum InvalidGetRange { - #[snafu(display( - "Wanted range starting at {requested}, but object was only {length} bytes long" - ))] + #[error("Wanted range starting at {requested}, but object was only {length} bytes long")] StartTooLarge { requested: usize, length: usize }, - #[snafu(display("Range started at {start} and ended at {end}"))] + #[error("Range started at {start} and ended at {end}")] Inconsistent { start: usize, end: usize }, } From 704d4086659f588e15d52a130683484484e99352 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 Jan 2025 13:29:36 +0100 Subject: [PATCH 372/397] Update itertools requirement from 0.13.0 to 0.14.0 in /object_store (#6925) Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version. - [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-itertools/itertools/compare/v0.13.0...v0.14.0) --- updated-dependencies: - dependency-name: itertools dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6f5e9db..992ae66 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ bytes = "1.0" chrono = { version = "0.4.34", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" -itertools = "0.13.0" +itertools = "0.14.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" thiserror = "2.0.2" From f54ebf02712304dd8370e6ff89a94f64c90ad1eb Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Sun, 5 Jan 2025 15:54:14 +0530 Subject: [PATCH 373/397] Minor clippy fixes (#6942) --- src/aws/client.rs | 2 +- src/azure/client.rs | 2 +- src/gcp/client.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 25fdd33..b81be0c 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -299,7 +299,7 @@ pub(crate) struct Request<'a> { retry_error_body: bool, } -impl<'a> Request<'a> { +impl Request<'_> { pub(crate) fn query(self, query: &T) -> Self { let builder = self.builder.query(query); Self { builder, ..self } diff --git a/src/azure/client.rs b/src/azure/client.rs index ea3a5fa..bd72d0c 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -198,7 +198,7 @@ struct PutRequest<'a> { idempotent: bool, } -impl<'a> PutRequest<'a> { +impl PutRequest<'_> { fn header(self, k: &HeaderName, v: &str) -> Self { let builder = self.builder.header(k, v); Self { builder, ..self } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 1928d13..d6f89ca 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -173,7 +173,7 @@ pub(crate) struct Request<'a> { idempotent: bool, } -impl<'a> Request<'a> { +impl Request<'_> { fn header(self, k: &HeaderName, v: &str) -> Self { let builder = self.builder.header(k, v); Self { builder, ..self } From 
98d1588ec9e277f9587510dd533b45b8bd5844d9 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 8 Jan 2025 06:06:13 -0800 Subject: [PATCH 374/397] Return `BoxStream` with `'static` lifetime from `ObjectStore::list` (#6619) Co-authored-by: Andrew Lamb --- src/aws/client.rs | 2 +- src/aws/mod.rs | 4 ++-- src/azure/client.rs | 2 +- src/azure/mod.rs | 7 +++--- src/chunked.rs | 4 ++-- src/client/list.rs | 19 +++++++-------- src/client/pagination.rs | 50 +++++++++++++++++++++++----------------- src/gcp/client.rs | 2 +- src/gcp/mod.rs | 4 ++-- src/http/mod.rs | 15 +++++++----- src/lib.rs | 8 +++---- src/limit.rs | 14 ++++++----- src/local.rs | 2 +- src/memory.rs | 2 +- src/prefix.rs | 32 +++++++++++++++++++++---- src/throttle.rs | 16 ++++++++----- tests/get_range_file.rs | 2 +- 17 files changed, 113 insertions(+), 72 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index b81be0c..246f277 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -855,7 +855,7 @@ impl GetClient for S3Client { } #[async_trait] -impl ListClient for S3Client { +impl ListClient for Arc { /// Make an S3 List request async fn list_request( &self, diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 7f449c4..82ef909 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -273,7 +273,7 @@ impl ObjectStore for AmazonS3 { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.client.list(prefix) } @@ -281,7 +281,7 @@ impl ObjectStore for AmazonS3 { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { if self.client.config.is_s3_express() { let offset = offset.clone(); // S3 Express does not support start-after diff --git a/src/azure/client.rs b/src/azure/client.rs index bd72d0c..fa5412c 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -925,7 +925,7 @@ impl GetClient for AzureClient { } #[async_trait] -impl ListClient for AzureClient { +impl ListClient for Arc { /// Make an Azure List request async fn list_request( &self, diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 81b6667..ea4dd8f 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -119,6 +119,9 @@ impl ObjectStore for MicrosoftAzure { self.client.delete_request(location, &()).await } + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { + self.client.list(prefix) + } fn delete_stream<'a>( &'a self, locations: BoxStream<'a, Result>, @@ -139,10 +142,6 @@ impl ObjectStore for MicrosoftAzure { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { - self.client.list(prefix) - } - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.client.list_with_delimiter(prefix).await } diff --git a/src/chunked.rs b/src/chunked.rs index 3f83c13..4998e9f 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -150,7 +150,7 @@ impl ObjectStore for ChunkedStore { self.inner.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.inner.list(prefix) } @@ -158,7 +158,7 @@ impl ObjectStore for ChunkedStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.inner.list_with_offset(prefix, offset) } diff --git a/src/client/list.rs b/src/client/list.rs index 4445d0d..fe9bfeb 100644 --- a/src/client/list.rs +++ b/src/client/list.rs @@ -44,37 +44,38 @@ pub(crate) trait 
ListClientExt { prefix: Option<&Path>, delimiter: bool, offset: Option<&Path>, - ) -> BoxStream<'_, Result>; + ) -> BoxStream<'static, Result>; - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result>; #[allow(unused)] fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result>; + ) -> BoxStream<'static, Result>; async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; } #[async_trait] -impl ListClientExt for T { +impl ListClientExt for T { fn list_paginated( &self, prefix: Option<&Path>, delimiter: bool, offset: Option<&Path>, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = offset.map(|x| x.to_string()); let prefix = prefix .filter(|x| !x.as_ref().is_empty()) .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)); stream_paginated( + self.clone(), (prefix, offset), - move |(prefix, offset), token| async move { - let (r, next_token) = self + move |client, (prefix, offset), token| async move { + let (r, next_token) = client .list_request( prefix.as_deref(), delimiter, @@ -88,7 +89,7 @@ impl ListClientExt for T { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.list_paginated(prefix, false, None) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() @@ -99,7 +100,7 @@ impl ListClientExt for T { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.list_paginated(prefix, false, Some(offset)) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() diff --git a/src/client/pagination.rs b/src/client/pagination.rs index 77b2a3d..d789c74 100644 --- a/src/client/pagination.rs +++ b/src/client/pagination.rs @@ -35,9 +35,14 @@ use std::future::Future; /// finish, otherwise it will continue to call `op(state, token)` with the values returned by the /// previous call to `op`, until a continuation token of `None` is returned /// -pub(crate) fn stream_paginated(state: S, op: F) -> impl Stream> +pub(crate) fn stream_paginated( + client: C, + state: S, + op: F, +) -> impl Stream> where - F: Fn(S, Option) -> Fut + Copy, + C: Clone, + F: Fn(C, S, Option) -> Fut + Copy, Fut: Future)>>, { enum PaginationState { @@ -46,27 +51,30 @@ where Done, } - futures::stream::unfold(PaginationState::Start(state), move |state| async move { - let (s, page_token) = match state { - PaginationState::Start(s) => (s, None), - PaginationState::HasMore(s, page_token) if !page_token.is_empty() => { - (s, Some(page_token)) - } - _ => { - return None; - } - }; + futures::stream::unfold(PaginationState::Start(state), move |state| { + let client = client.clone(); + async move { + let (s, page_token) = match state { + PaginationState::Start(s) => (s, None), + PaginationState::HasMore(s, page_token) if !page_token.is_empty() => { + (s, Some(page_token)) + } + _ => { + return None; + } + }; - let (resp, s, continuation) = match op(s, page_token).await { - Ok(resp) => resp, - Err(e) => return Some((Err(e), PaginationState::Done)), - }; + let (resp, s, continuation) = match op(client, s, page_token).await { + Ok(resp) => resp, + Err(e) => return Some((Err(e), PaginationState::Done)), + }; - let next_state = match continuation { - Some(token) => PaginationState::HasMore(s, token), - None => PaginationState::Done, - }; + let next_state = match continuation { + 
Some(token) => PaginationState::HasMore(s, token), + None => PaginationState::Done, + }; - Some((Ok(resp), next_state)) + Some((Ok(resp), next_state)) + } }) } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index d6f89ca..8dd1c69 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -633,7 +633,7 @@ impl GetClient for GoogleCloudStorageClient { } #[async_trait] -impl ListClient for GoogleCloudStorageClient { +impl ListClient for Arc { /// Perform a list request async fn list_request( &self, diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 5199135..a2f5124 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -183,7 +183,7 @@ impl ObjectStore for GoogleCloudStorage { self.client.delete_request(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.client.list(prefix) } @@ -191,7 +191,7 @@ impl ObjectStore for GoogleCloudStorage { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.client.list_with_offset(prefix, offset) } diff --git a/src/http/mod.rs b/src/http/mod.rs index 417f728..899740d 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -31,6 +31,8 @@ //! [rfc2518]: https://datatracker.ietf.org/doc/html/rfc2518 //! [WebDAV]: https://en.wikipedia.org/wiki/WebDAV +use std::sync::Arc; + use async_trait::async_trait; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; @@ -79,7 +81,7 @@ impl From for crate::Error { /// See [`crate::http`] for more information #[derive(Debug)] pub struct HttpStore { - client: Client, + client: Arc, } impl std::fmt::Display for HttpStore { @@ -130,19 +132,20 @@ impl ObjectStore for HttpStore { self.client.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or_default(); let prefix = prefix.cloned(); + let client = Arc::clone(&self.client); futures::stream::once(async move { - let status = self.client.list(prefix.as_ref(), "infinity").await?; + let status = client.list(prefix.as_ref(), "infinity").await?; let iter = status .response .into_iter() .filter(|r| !r.is_dir()) - .map(|response| { + .map(move |response| { response.check_ok()?; - response.object_meta(self.client.base_url()) + response.object_meta(client.base_url()) }) // Filter out exact prefix matches .filter_ok(move |r| r.location.as_ref().len() > prefix_len); @@ -238,7 +241,7 @@ impl HttpBuilder { let parsed = Url::parse(&url).map_err(|source| Error::UnableToParseUrl { url, source })?; Ok(HttpStore { - client: Client::new(parsed, self.client_options, self.retry_config)?, + client: Arc::new(Client::new(parsed, self.client_options, self.retry_config)?), }) } } diff --git a/src/lib.rs b/src/lib.rs index 987ffac..53eda5a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -722,7 +722,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// `foo/bar_baz/x`. List is recursive, i.e. `foo/bar/more/x` will be included. 
/// /// Note: the order of returned [`ObjectMeta`] is not guaranteed - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result>; /// List all the objects with the given prefix and a location greater than `offset` /// @@ -734,7 +734,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = offset.clone(); self.list(prefix) .try_filter(move |f| futures::future::ready(f.location > offset)) @@ -847,7 +847,7 @@ macro_rules! as_ref_impl { self.as_ref().delete_stream(locations) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.as_ref().list(prefix) } @@ -855,7 +855,7 @@ macro_rules! as_ref_impl { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.as_ref().list_with_offset(prefix, offset) } diff --git a/src/limit.rs b/src/limit.rs index 6a3c3b5..77f72a0 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -45,7 +45,7 @@ use tokio::sync::{OwnedSemaphorePermit, Semaphore}; /// #[derive(Debug)] pub struct LimitStore { - inner: T, + inner: Arc, max_requests: usize, semaphore: Arc, } @@ -56,7 +56,7 @@ impl LimitStore { /// `max_requests` pub fn new(inner: T, max_requests: usize) -> Self { Self { - inner, + inner: Arc::new(inner), max_requests, semaphore: Arc::new(Semaphore::new(max_requests)), } @@ -144,12 +144,13 @@ impl ObjectStore for LimitStore { self.inner.delete_stream(locations) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix = prefix.cloned(); + let inner = Arc::clone(&self.inner); let fut = Arc::clone(&self.semaphore) .acquire_owned() .map(move |permit| { - let s = self.inner.list(prefix.as_ref()); + let s = inner.list(prefix.as_ref()); PermitWrapper::new(s, permit.unwrap()) }); fut.into_stream().flatten().boxed() @@ -159,13 +160,14 @@ impl ObjectStore for LimitStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let prefix = prefix.cloned(); let offset = offset.clone(); + let inner = Arc::clone(&self.inner); let fut = Arc::clone(&self.semaphore) .acquire_owned() .map(move |permit| { - let s = self.inner.list_with_offset(prefix.as_ref(), &offset); + let s = inner.list_with_offset(prefix.as_ref(), &offset); PermitWrapper::new(s, permit.unwrap()) }); fut.into_stream().flatten().boxed() diff --git a/src/local.rs b/src/local.rs index b193481..3640264 100644 --- a/src/local.rs +++ b/src/local.rs @@ -488,7 +488,7 @@ impl ObjectStore for LocalFileSystem { .await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let config = Arc::clone(&self.config); let root_path = match prefix { diff --git a/src/memory.rs b/src/memory.rs index 3f3cff3..6402f92 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -297,7 +297,7 @@ impl ObjectStore for InMemory { Ok(()) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let root = Path::default(); let prefix = prefix.unwrap_or(&root); diff --git a/src/prefix.rs b/src/prefix.rs index 227887d..a0b67ca 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ 
-74,6 +74,28 @@ impl PrefixStore { } } +// Note: This is a relative hack to move these two functions to pure functions so they don't rely +// on the `self` lifetime. Expected to be cleaned up before merge. +// +/// Strip the constant prefix from a given path +fn strip_prefix(prefix: &Path, path: Path) -> Path { + // Note cannot use match because of borrow checker + if let Some(suffix) = path.prefix_match(prefix) { + return suffix.collect(); + } + path +} + +/// Strip the constant prefix from a given ObjectMeta +fn strip_meta(prefix: &Path, meta: ObjectMeta) -> ObjectMeta { + ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: strip_prefix(prefix, meta.location), + e_tag: meta.e_tag, + version: None, + } +} #[async_trait::async_trait] impl ObjectStore for PrefixStore { async fn put(&self, location: &Path, payload: PutPayload) -> Result { @@ -136,21 +158,23 @@ impl ObjectStore for PrefixStore { self.inner.delete(&full_path).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix = self.full_path(prefix.unwrap_or(&Path::default())); let s = self.inner.list(Some(&prefix)); - s.map_ok(|meta| self.strip_meta(meta)).boxed() + let slf_prefix = self.prefix.clone(); + s.map_ok(move |meta| strip_meta(&slf_prefix, meta)).boxed() } fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = self.full_path(offset); let prefix = self.full_path(prefix.unwrap_or(&Path::default())); let s = self.inner.list_with_offset(Some(&prefix), &offset); - s.map_ok(|meta| self.strip_meta(meta)).boxed() + let slf_prefix = self.prefix.clone(); + s.map_ok(move |meta| strip_meta(&slf_prefix, meta)).boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/src/throttle.rs b/src/throttle.rs index b9dff5c..29cd327 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -237,11 +237,13 @@ impl ObjectStore for ThrottledStore { self.inner.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let stream = self.inner.list(prefix); + let config = Arc::clone(&self.config); futures::stream::once(async move { - let wait_list_per_entry = self.config().wait_list_per_entry; - sleep(self.config().wait_list_per_call).await; + let config = *config.lock(); + let wait_list_per_entry = config.wait_list_per_entry; + sleep(config.wait_list_per_call).await; throttle_stream(stream, move |_| wait_list_per_entry) }) .flatten() @@ -252,11 +254,13 @@ impl ObjectStore for ThrottledStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let stream = self.inner.list_with_offset(prefix, offset); + let config = Arc::clone(&self.config); futures::stream::once(async move { - let wait_list_per_entry = self.config().wait_list_per_entry; - sleep(self.config().wait_list_per_call).await; + let config = *config.lock(); + let wait_list_per_entry = config.wait_list_per_entry; + sleep(config.wait_list_per_call).await; throttle_stream(stream, move |_| wait_list_per_entry) }) .flatten() diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index c5550ac..e500fc8 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -62,7 +62,7 @@ impl ObjectStore for MyStore { todo!() } - fn list(&self, _: Option<&Path>) -> BoxStream<'_, 
Result> { + fn list(&self, _: Option<&Path>) -> BoxStream<'static, Result> { todo!() } From 00b58832d4ce73b17eebd348e5b4f08635b6eb3f Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 15 Jan 2025 00:27:02 -0800 Subject: [PATCH 375/397] Improve docs for `AmazonS3Builder::from_env` (#6977) * Improve docs for `AmazonS3Builder::from_env` * wording * Update object_store/src/aws/builder.rs --------- Co-authored-by: Andrew Lamb --- src/aws/builder.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index d29fa78..29b2eef 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -427,7 +427,11 @@ impl AmazonS3Builder { /// Fill the [`AmazonS3Builder`] with regular AWS environment variables /// - /// Variables extracted from environment: + /// All environment variables starting with `AWS_` will be evaluated. Names must + /// match acceptable input to [`AmazonS3ConfigKey::from_str`]. Only upper-case environment + /// variables are accepted. + /// + /// Some examples of variables extracted from environment: /// * `AWS_ACCESS_KEY_ID` -> access_key_id /// * `AWS_SECRET_ACCESS_KEY` -> secret_access_key /// * `AWS_DEFAULT_REGION` -> region @@ -435,6 +439,7 @@ impl AmazonS3Builder { /// * `AWS_SESSION_TOKEN` -> token /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS + /// * `AWS_REQUEST_PAYER` -> set to "true" to permit operations on requester-pays buckets. /// # Example /// ``` /// use object_store::aws::AmazonS3Builder; From 8283d76d18405349d249ecf3afd3c640497d2120 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 17 Jan 2025 16:52:02 -0500 Subject: [PATCH 376/397] Use `u64` range instead of `usize`, for better wasm32 support (#6961) * u64 ranges * more u64 * make clippy happy * even more u64 * Update object_store/src/lib.rs Co-authored-by: Andrew Lamb * Update object_store/src/lib.rs Co-authored-by: Andrew Lamb * address comments --------- Co-authored-by: Andrew Lamb --- src/azure/client.rs | 2 +- src/chunked.rs | 8 ++--- src/client/get.rs | 10 +++--- src/client/s3.rs | 2 +- src/http/client.rs | 4 +-- src/integration.rs | 10 +++--- src/lib.rs | 26 +++++++++------ src/limit.rs | 4 +-- src/local.rs | 72 ++++++++++++++++++----------------------- src/memory.rs | 36 ++++++++++++++------- src/prefix.rs | 4 +-- src/throttle.rs | 6 ++-- src/util.rs | 63 +++++++++++++++++++++++------------- tests/get_range_file.rs | 13 +++++--- 14 files changed, 147 insertions(+), 113 deletions(-) diff --git a/src/azure/client.rs b/src/azure/client.rs index fa5412c..2c2e27e 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -1058,7 +1058,7 @@ impl TryFrom for ObjectMeta { Ok(Self { location: Path::parse(value.name)?, last_modified: value.properties.last_modified, - size: value.properties.content_length as usize, + size: value.properties.content_length, e_tag: value.properties.e_tag, version: None, // For consistency with S3 and GCP which don't include this }) diff --git a/src/chunked.rs b/src/chunked.rs index 4998e9f..2bb30b9 100644 --- a/src/chunked.rs +++ b/src/chunked.rs @@ -44,7 +44,7 @@ use crate::{PutPayload, Result}; #[derive(Debug)] pub struct ChunkedStore { inner: Arc, - chunk_size: usize, + chunk_size: usize, // chunks are in memory, so we use usize not u64 } impl ChunkedStore { @@ -138,7 +138,7 @@ impl ObjectStore for ChunkedStore { }) } - async fn get_range(&self, location: &Path, range: Range) -> Result { + async fn get_range(&self, location: &Path, 
range: Range) -> Result { self.inner.get_range(location, range).await } @@ -203,8 +203,8 @@ mod tests { let mut remaining = 1001; while let Some(next) = s.next().await { - let size = next.unwrap().len(); - let expected = remaining.min(chunk_size); + let size = next.unwrap().len() as u64; + let expected = remaining.min(chunk_size as u64); assert_eq!(size, expected); remaining -= expected; } diff --git a/src/client/get.rs b/src/client/get.rs index 57aca89..f252dd9 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -67,9 +67,9 @@ impl GetClientExt for T { struct ContentRange { /// The range of the object returned - range: Range, + range: Range, /// The total size of the object being requested - size: usize, + size: u64, } impl ContentRange { @@ -84,7 +84,7 @@ impl ContentRange { let (start_s, end_s) = range.split_once('-')?; let start = start_s.parse().ok()?; - let end: usize = end_s.parse().ok()?; + let end: u64 = end_s.parse().ok()?; Some(Self { size, @@ -140,8 +140,8 @@ enum GetResultError { #[error("Requested {expected:?}, got {actual:?}")] UnexpectedRange { - expected: Range, - actual: Range, + expected: Range, + actual: Range, }, } diff --git a/src/client/s3.rs b/src/client/s3.rs index 7fe956b..a2221fb 100644 --- a/src/client/s3.rs +++ b/src/client/s3.rs @@ -66,7 +66,7 @@ pub struct ListPrefix { #[serde(rename_all = "PascalCase")] pub struct ListContents { pub key: String, - pub size: usize, + pub size: u64, pub last_modified: DateTime, #[serde(rename = "ETag")] pub e_tag: Option, diff --git a/src/http/client.rs b/src/http/client.rs index 41e6464..9983fdf 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -420,7 +420,7 @@ impl MultiStatusResponse { })?) } - fn size(&self) -> Result { + fn size(&self) -> Result { let size = self .prop_stat .prop @@ -462,7 +462,7 @@ pub(crate) struct Prop { last_modified: DateTime, #[serde(rename = "getcontentlength")] - content_length: Option, + content_length: Option, #[serde(rename = "resourcetype")] resource_type: ResourceType, diff --git a/src/integration.rs b/src/integration.rs index 20e95fd..25a9294 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -112,7 +112,7 @@ pub async fn put_get_delete_list(storage: &DynObjectStore) { let range_result = storage.get_range(&location, range.clone()).await; let bytes = range_result.unwrap(); - assert_eq!(bytes, data.slice(range.clone())); + assert_eq!(bytes, data.slice(range.start as usize..range.end as usize)); let opts = GetOptions { range: Some(GetRange::Bounded(2..5)), @@ -190,11 +190,11 @@ pub async fn put_get_delete_list(storage: &DynObjectStore) { let ranges = vec![0..1, 2..3, 0..5]; let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); for (range, bytes) in ranges.iter().zip(bytes) { - assert_eq!(bytes, data.slice(range.clone())) + assert_eq!(bytes, data.slice(range.start as usize..range.end as usize)); } let head = storage.head(&location).await.unwrap(); - assert_eq!(head.size, data.len()); + assert_eq!(head.size, data.len() as u64); storage.delete(&location).await.unwrap(); @@ -934,7 +934,7 @@ pub async fn list_with_delimiter(storage: &DynObjectStore) { let object = &result.objects[0]; assert_eq!(object.location, expected_location); - assert_eq!(object.size, data.len()); + assert_eq!(object.size, data.len() as u64); // ==================== check: prefix-list `mydb/wb/000/000/001` (partial filename doesn't match) ==================== let prefix = Path::from("mydb/wb/000/000/001"); @@ -1085,7 +1085,7 @@ pub async fn multipart(storage: &dyn ObjectStore, multipart: 
&dyn MultipartStore .unwrap(); let meta = storage.head(&path).await.unwrap(); - assert_eq!(meta.size, chunk_size * 2); + assert_eq!(meta.size, chunk_size as u64 * 2); // Empty case let path = Path::from("test_empty_multipart"); diff --git a/src/lib.rs b/src/lib.rs index 53eda5a..cffcbbd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -234,7 +234,7 @@ //! //! // Buffer the entire object in memory //! let object: Bytes = result.bytes().await.unwrap(); -//! assert_eq!(object.len(), meta.size); +//! assert_eq!(object.len() as u64, meta.size); //! //! // Alternatively stream the bytes from object storage //! let stream = object_store.get(&path).await.unwrap().into_stream(); @@ -630,7 +630,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// in the given byte range. /// /// See [`GetRange::Bounded`] for more details on how `range` gets interpreted - async fn get_range(&self, location: &Path, range: Range) -> Result { + async fn get_range(&self, location: &Path, range: Range) -> Result { let options = GetOptions { range: Some(range.into()), ..Default::default() @@ -640,7 +640,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Return the bytes that are stored at the specified location /// in the given byte ranges - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { coalesce_ranges( ranges, |range| self.get_range(location, range), @@ -820,14 +820,14 @@ macro_rules! as_ref_impl { self.as_ref().get_opts(location, options).await } - async fn get_range(&self, location: &Path, range: Range) -> Result { + async fn get_range(&self, location: &Path, range: Range) -> Result { self.as_ref().get_range(location, range).await } async fn get_ranges( &self, location: &Path, - ranges: &[Range], + ranges: &[Range], ) -> Result> { self.as_ref().get_ranges(location, ranges).await } @@ -903,8 +903,10 @@ pub struct ObjectMeta { pub location: Path, /// The last modified time pub last_modified: DateTime, - /// The size in bytes of the object - pub size: usize, + /// The size in bytes of the object. 
+ /// + /// Note this is not `usize` as `object_store` supports 32-bit architectures such as WASM + pub size: u64, /// The unique identifier for the object /// /// @@ -1019,7 +1021,9 @@ pub struct GetResult { /// The [`ObjectMeta`] for this object pub meta: ObjectMeta, /// The range of bytes returned by this request - pub range: Range, + /// + /// Note this is not `usize` as `object_store` supports 32-bit architectures such as WASM + pub range: Range, /// Additional object attributes pub attributes: Attributes, } @@ -1060,7 +1064,11 @@ impl GetResult { path: path.clone(), })?; - let mut buffer = Vec::with_capacity(len); + let mut buffer = if let Ok(len) = len.try_into() { + Vec::with_capacity(len) + } else { + Vec::new() + }; file.take(len as _) .read_to_end(&mut buffer) .map_err(|source| local::Error::UnableToReadBytes { source, path })?; diff --git a/src/limit.rs b/src/limit.rs index 77f72a0..330a0da 100644 --- a/src/limit.rs +++ b/src/limit.rs @@ -117,12 +117,12 @@ impl ObjectStore for LimitStore { Ok(permit_get_result(r, permit)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { + async fn get_range(&self, location: &Path, range: Range) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.get_range(location, range).await } - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.get_ranges(location, ranges).await } diff --git a/src/local.rs b/src/local.rs index 3640264..65e87f9 100644 --- a/src/local.rs +++ b/src/local.rs @@ -21,7 +21,7 @@ use std::io::{ErrorKind, Read, Seek, SeekFrom, Write}; use std::ops::Range; use std::sync::Arc; use std::time::SystemTime; -use std::{collections::BTreeSet, convert::TryFrom, io}; +use std::{collections::BTreeSet, io}; use std::{collections::VecDeque, path::PathBuf}; use async_trait::async_trait; @@ -44,12 +44,6 @@ use crate::{ /// A specialized `Error` for filesystem object store-related errors #[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[error("File size for {} did not fit in a usize: {}", path, source)] - FileSizeOverflowedUsize { - source: std::num::TryFromIntError, - path: String, - }, - #[error("Unable to walk dir: {}", source)] UnableToWalkDir { source: walkdir::Error }, @@ -83,8 +77,8 @@ pub(crate) enum Error { #[error("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual)] OutOfRange { path: PathBuf, - expected: usize, - actual: usize, + expected: u64, + actual: u64, }, #[error("Requested range was invalid")] @@ -410,7 +404,7 @@ impl ObjectStore for LocalFileSystem { let path = self.path_to_filesystem(&location)?; maybe_spawn_blocking(move || { let (file, metadata) = open_file(&path)?; - let meta = convert_metadata(metadata, location)?; + let meta = convert_metadata(metadata, location); options.check_preconditions(&meta)?; let range = match options.range { @@ -430,7 +424,7 @@ impl ObjectStore for LocalFileSystem { .await } - async fn get_range(&self, location: &Path, range: Range) -> Result { + async fn get_range(&self, location: &Path, range: Range) -> Result { let path = self.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let (mut file, _) = open_file(&path)?; @@ -439,7 +433,7 @@ impl ObjectStore for LocalFileSystem { .await } - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> 
Result> { let path = self.path_to_filesystem(location)?; let ranges = ranges.to_vec(); maybe_spawn_blocking(move || { @@ -825,7 +819,7 @@ impl Drop for LocalUpload { pub(crate) fn chunked_stream( mut file: File, path: PathBuf, - range: Range, + range: Range, chunk_size: usize, ) -> BoxStream<'static, Result> { futures::stream::once(async move { @@ -847,17 +841,23 @@ pub(crate) fn chunked_stream( return Ok(None); } - let to_read = remaining.min(chunk_size); - let mut buffer = Vec::with_capacity(to_read); + let to_read = remaining.min(chunk_size as u64); + let cap = usize::try_from(to_read).map_err(|_e| Error::InvalidRange { + source: InvalidGetRange::TooLarge { + requested: to_read, + max: usize::MAX as u64, + }, + })?; + let mut buffer = Vec::with_capacity(cap); let read = (&mut file) - .take(to_read as u64) + .take(to_read) .read_to_end(&mut buffer) .map_err(|e| Error::UnableToReadBytes { source: e, path: path.clone(), })?; - Ok(Some((buffer.into(), (file, path, remaining - read)))) + Ok(Some((buffer.into(), (file, path, remaining - read as u64)))) }) }, ); @@ -867,22 +867,18 @@ pub(crate) fn chunked_stream( .boxed() } -pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { +pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { let to_read = range.end - range.start; - file.seek(SeekFrom::Start(range.start as u64)) - .map_err(|source| { - let path = path.into(); - Error::Seek { source, path } - })?; + file.seek(SeekFrom::Start(range.start)).map_err(|source| { + let path = path.into(); + Error::Seek { source, path } + })?; - let mut buf = Vec::with_capacity(to_read); - let read = file - .take(to_read as u64) - .read_to_end(&mut buf) - .map_err(|source| { - let path = path.into(); - Error::UnableToReadBytes { source, path } - })?; + let mut buf = Vec::with_capacity(to_read as usize); + let read = file.take(to_read).read_to_end(&mut buf).map_err(|source| { + let path = path.into(); + Error::UnableToReadBytes { source, path } + })? 
as u64; if read != to_read { let error = Error::OutOfRange { @@ -922,7 +918,7 @@ fn open_file(path: &PathBuf) -> Result<(File, Metadata)> { fn convert_entry(entry: DirEntry, location: Path) -> Result> { match entry.metadata() { - Ok(metadata) => convert_metadata(metadata, location).map(Some), + Ok(metadata) => Ok(Some(convert_metadata(metadata, location))), Err(e) => { if let Some(io_err) = e.io_error() { if io_err.kind() == ErrorKind::NotFound { @@ -960,20 +956,16 @@ fn get_etag(metadata: &Metadata) -> String { format!("{inode:x}-{mtime:x}-{size:x}") } -fn convert_metadata(metadata: Metadata, location: Path) -> Result { +fn convert_metadata(metadata: Metadata, location: Path) -> ObjectMeta { let last_modified = last_modified(&metadata); - let size = usize::try_from(metadata.len()).map_err(|source| { - let path = location.as_ref().into(); - Error::FileSizeOverflowedUsize { source, path } - })?; - Ok(ObjectMeta { + ObjectMeta { location, last_modified, - size, + size: metadata.len(), e_tag: Some(get_etag(&metadata)), version: None, - }) + } } #[cfg(unix)] diff --git a/src/memory.rs b/src/memory.rs index 6402f92..26beff1 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -241,7 +241,7 @@ impl ObjectStore for InMemory { let meta = ObjectMeta { location: location.clone(), last_modified: entry.last_modified, - size: entry.data.len(), + size: entry.data.len() as u64, e_tag: Some(e_tag), version: None, }; @@ -250,11 +250,14 @@ impl ObjectStore for InMemory { let (range, data) = match options.range { Some(range) => { let r = range - .as_range(entry.data.len()) + .as_range(entry.data.len() as u64) .map_err(|source| Error::Range { source })?; - (r.clone(), entry.data.slice(r)) + ( + r.clone(), + entry.data.slice(r.start as usize..r.end as usize), + ) } - None => (0..entry.data.len(), entry.data), + None => (0..entry.data.len() as u64, entry.data), }; let stream = futures::stream::once(futures::future::ready(Ok(data))); @@ -266,16 +269,27 @@ impl ObjectStore for InMemory { }) } - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let entry = self.entry(location).await?; ranges .iter() .map(|range| { let r = GetRange::Bounded(range.clone()) - .as_range(entry.data.len()) + .as_range(entry.data.len() as u64) .map_err(|source| Error::Range { source })?; - - Ok(entry.data.slice(r)) + let r_end = usize::try_from(r.end).map_err(|_e| Error::Range { + source: InvalidGetRange::TooLarge { + requested: r.end, + max: usize::MAX as u64, + }, + })?; + let r_start = usize::try_from(r.start).map_err(|_e| Error::Range { + source: InvalidGetRange::TooLarge { + requested: r.start, + max: usize::MAX as u64, + }, + })?; + Ok(entry.data.slice(r_start..r_end)) }) .collect() } @@ -286,7 +300,7 @@ impl ObjectStore for InMemory { Ok(ObjectMeta { location: location.clone(), last_modified: entry.last_modified, - size: entry.data.len(), + size: entry.data.len() as u64, e_tag: Some(entry.e_tag.to_string()), version: None, }) @@ -316,7 +330,7 @@ impl ObjectStore for InMemory { Ok(ObjectMeta { location: key.clone(), last_modified: value.last_modified, - size: value.data.len(), + size: value.data.len() as u64, e_tag: Some(value.e_tag.to_string()), version: None, }) @@ -361,7 +375,7 @@ impl ObjectStore for InMemory { let object = ObjectMeta { location: k.clone(), last_modified: v.last_modified, - size: v.data.len(), + size: v.data.len() as u64, e_tag: Some(v.e_tag.to_string()), version: None, }; diff --git a/src/prefix.rs 
b/src/prefix.rs index a0b67ca..ac9803e 100644 --- a/src/prefix.rs +++ b/src/prefix.rs @@ -132,7 +132,7 @@ impl ObjectStore for PrefixStore { self.inner.get(&full_path).await } - async fn get_range(&self, location: &Path, range: Range) -> Result { + async fn get_range(&self, location: &Path, range: Range) -> Result { let full_path = self.full_path(location); self.inner.get_range(&full_path, range).await } @@ -142,7 +142,7 @@ impl ObjectStore for PrefixStore { self.inner.get_opts(&full_path, options).await } - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let full_path = self.full_path(location); self.inner.get_ranges(&full_path, ranges).await } diff --git a/src/throttle.rs b/src/throttle.rs index 29cd327..6586ba9 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -203,7 +203,7 @@ impl ObjectStore for ThrottledStore { Ok(throttle_get(result, wait_get_per_byte)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { + async fn get_range(&self, location: &Path, range: Range) -> Result { let config = self.config(); let sleep_duration = @@ -214,10 +214,10 @@ impl ObjectStore for ThrottledStore { self.inner.get_range(location, range).await } - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let config = self.config(); - let total_bytes: usize = ranges.iter().map(|range| range.end - range.start).sum(); + let total_bytes: u64 = ranges.iter().map(|range| range.end - range.start).sum(); let sleep_duration = config.wait_get_per_call + config.wait_get_per_byte * total_bytes as u32; diff --git a/src/util.rs b/src/util.rs index 6d638f3..17a7a8c 100644 --- a/src/util.rs +++ b/src/util.rs @@ -49,7 +49,7 @@ pub(crate) fn hmac_sha256(secret: impl AsRef<[u8]>, bytes: impl AsRef<[u8]>) -> } /// Collect a stream into [`Bytes`] avoiding copying in the event of a single chunk -pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result +pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result where E: Send, S: Stream> + Send + Unpin, @@ -60,9 +60,9 @@ where match stream.next().await.transpose()? 
{ None => Ok(first), Some(second) => { - let size_hint = size_hint.unwrap_or_else(|| first.len() + second.len()); + let size_hint = size_hint.unwrap_or_else(|| first.len() as u64 + second.len() as u64); - let mut buf = Vec::with_capacity(size_hint); + let mut buf = Vec::with_capacity(size_hint as usize); buf.extend_from_slice(&first); buf.extend_from_slice(&second); while let Some(maybe_bytes) = stream.next().await { @@ -89,7 +89,7 @@ where /// Range requests with a gap less than or equal to this, /// will be coalesced into a single request by [`coalesce_ranges`] -pub const OBJECT_STORE_COALESCE_DEFAULT: usize = 1024 * 1024; +pub const OBJECT_STORE_COALESCE_DEFAULT: u64 = 1024 * 1024; /// Up to this number of range requests will be performed in parallel by [`coalesce_ranges`] pub(crate) const OBJECT_STORE_COALESCE_PARALLEL: usize = 10; @@ -103,12 +103,12 @@ pub(crate) const OBJECT_STORE_COALESCE_PARALLEL: usize = 10; /// * Make multiple `fetch` requests in parallel (up to maximum of 10) /// pub async fn coalesce_ranges( - ranges: &[Range], + ranges: &[Range], fetch: F, - coalesce: usize, + coalesce: u64, ) -> Result, E> where - F: Send + FnMut(Range) -> Fut, + F: Send + FnMut(Range) -> Fut, E: Send, Fut: std::future::Future> + Send, { @@ -129,13 +129,14 @@ where let start = range.start - fetch_range.start; let end = range.end - fetch_range.start; - fetch_bytes.slice(start..end.min(fetch_bytes.len())) + let range = (start as usize)..(end as usize).min(fetch_bytes.len()); + fetch_bytes.slice(range) }) .collect()) } /// Returns a sorted list of ranges that cover `ranges` -fn merge_ranges(ranges: &[Range], coalesce: usize) -> Vec> { +fn merge_ranges(ranges: &[Range], coalesce: u64) -> Vec> { if ranges.is_empty() { return vec![]; } @@ -196,38 +197,49 @@ pub enum GetRange { /// an error will be returned. Additionally, if the range ends after the end /// of the object, the entire remainder of the object will be returned. /// Otherwise, the exact requested range will be returned. - Bounded(Range), + /// + /// Note that range is u64 (i.e., not usize), + /// as `object_store` supports 32-bit architectures such as WASM + Bounded(Range), /// Request all bytes starting from a given byte offset - Offset(usize), + Offset(u64), /// Request up to the last n bytes - Suffix(usize), + Suffix(u64), } #[derive(Debug, thiserror::Error)] pub(crate) enum InvalidGetRange { #[error("Wanted range starting at {requested}, but object was only {length} bytes long")] - StartTooLarge { requested: usize, length: usize }, + StartTooLarge { requested: u64, length: u64 }, #[error("Range started at {start} and ended at {end}")] - Inconsistent { start: usize, end: usize }, + Inconsistent { start: u64, end: u64 }, + + #[error("Range {requested} is larger than system memory limit {max}")] + TooLarge { requested: u64, max: u64 }, } impl GetRange { pub(crate) fn is_valid(&self) -> Result<(), InvalidGetRange> { - match self { - Self::Bounded(r) if r.end <= r.start => { + if let Self::Bounded(r) = self { + if r.end <= r.start { return Err(InvalidGetRange::Inconsistent { start: r.start, end: r.end, }); } - _ => (), - }; + if (r.end - r.start) > usize::MAX as u64 { + return Err(InvalidGetRange::TooLarge { + requested: r.start, + max: usize::MAX as u64, + }); + } + } Ok(()) } /// Convert to a [`Range`] if valid. 
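
As a standalone illustration of the `Bounded` semantics documented above, now expressed with `u64`: a start offset at or past the end of the object is an error, an end past the object is clamped, and the conversion back to `usize` happens only at the final in-memory slicing step, the same pattern the `memory.rs` and `util.rs` hunks follow. This sketch is not code from the patch; the function name and error type are illustrative only.

```rust
use std::ops::Range;

/// Illustrative only: apply a `Bounded` byte range to an in-memory buffer.
fn apply_bounded_range(data: &[u8], range: Range<u64>) -> Result<&[u8], String> {
    let len = data.len() as u64;
    if range.end <= range.start {
        return Err(format!("inverted range {}..{}", range.start, range.end));
    }
    if range.start >= len {
        return Err(format!("start {} is beyond object of {} bytes", range.start, len));
    }
    // Clamp the end to the object length, then convert to usize only for slicing;
    // on 32-bit targets (e.g. wasm32) this conversion is where an oversized
    // request fails instead of silently truncating.
    let end = range.end.min(len);
    let start = usize::try_from(range.start).map_err(|_| "range does not fit in memory".to_string())?;
    let end = usize::try_from(end).map_err(|_| "range does not fit in memory".to_string())?;
    Ok(&data[start..end])
}
```

Requesting `0..(2 * len)` on a `len`-byte object therefore returns the whole object, which is what the `test_get_opts_over_range` test further below asserts.
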
- pub(crate) fn as_range(&self, len: usize) -> Result, InvalidGetRange> { + pub(crate) fn as_range(&self, len: u64) -> Result, InvalidGetRange> { self.is_valid()?; match self { Self::Bounded(r) => { @@ -267,7 +279,7 @@ impl Display for GetRange { } } -impl> From for GetRange { +impl> From for GetRange { fn from(value: T) -> Self { use std::ops::Bound::*; let first = match value.start_bound() { @@ -323,7 +335,7 @@ mod tests { /// Calls coalesce_ranges and validates the returned data is correct /// /// Returns the fetched ranges - async fn do_fetch(ranges: Vec>, coalesce: usize) -> Vec> { + async fn do_fetch(ranges: Vec>, coalesce: u64) -> Vec> { let max = ranges.iter().map(|x| x.end).max().unwrap_or(0); let src: Vec<_> = (0..max).map(|x| x as u8).collect(); @@ -332,7 +344,9 @@ mod tests { &ranges, |range| { fetches.push(range.clone()); - futures::future::ready(Ok(Bytes::from(src[range].to_vec()))) + let start = usize::try_from(range.start).unwrap(); + let end = usize::try_from(range.end).unwrap(); + futures::future::ready(Ok(Bytes::from(src[start..end].to_vec()))) }, coalesce, ) @@ -341,7 +355,10 @@ mod tests { assert_eq!(ranges.len(), coalesced.len()); for (range, bytes) in ranges.iter().zip(coalesced) { - assert_eq!(bytes.as_ref(), &src[range.clone()]); + assert_eq!( + bytes.as_ref(), + &src[usize::try_from(range.start).unwrap()..usize::try_from(range.end).unwrap()] + ); } fetches } diff --git a/tests/get_range_file.rs b/tests/get_range_file.rs index e500fc8..6790c11 100644 --- a/tests/get_range_file.rs +++ b/tests/get_range_file.rs @@ -90,12 +90,15 @@ async fn test_get_range() { let fetched = store.get(&path).await.unwrap().bytes().await.unwrap(); assert_eq!(expected, fetched); - for range in [0..10, 3..5, 0..expected.len()] { + for range in [0..10, 3..5, 0..expected.len() as u64] { let data = store.get_range(&path, range.clone()).await.unwrap(); - assert_eq!(&data[..], &expected[range]) + assert_eq!( + &data[..], + &expected[range.start as usize..range.end as usize] + ) } - let over_range = 0..(expected.len() * 2); + let over_range = 0..(expected.len() as u64 * 2); let data = store.get_range(&path, over_range.clone()).await.unwrap(); assert_eq!(&data[..], expected) } @@ -113,10 +116,10 @@ async fn test_get_opts_over_range() { store.put(&path, expected.clone().into()).await.unwrap(); let opts = GetOptions { - range: Some(GetRange::Bounded(0..(expected.len() * 2))), + range: Some(GetRange::Bounded(0..(expected.len() as u64 * 2))), ..Default::default() }; let res = store.get_opts(&path, opts).await.unwrap(); - assert_eq!(res.range, 0..expected.len()); + assert_eq!(res.range, 0..expected.len() as u64); assert_eq!(res.bytes().await.unwrap(), expected); } From 873ed4dad722fbe77e5636e313096266951a2c60 Mon Sep 17 00:00:00 2001 From: Diptanu Choudhury Date: Tue, 28 Jan 2025 13:47:15 -0800 Subject: [PATCH 377/397] Change Log On Succesful S3 Copy / Multipart Upload to Debug (#7033) * Update retry.rs * Update object_store/src/client/retry.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- src/client/retry.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/client/retry.rs b/src/client/retry.rs index 8938b08..a3f8fcb 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -24,7 +24,7 @@ use reqwest::header::LOCATION; use reqwest::{Client, Request, Response, StatusCode}; use std::error::Error as StdError; use std::time::{Duration, Instant}; 
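
The retry-path change below is small but worth spelling out: the full response body of a successful S3 copy or multipart completion is only interesting when debugging, so its log line moves from `info!` to `debug!`. A minimal sketch of the intended split, assuming only the `tracing` macros this module already imports (the `info!` message here is illustrative, not from the patch):

```rust
use tracing::{debug, info};

// Chatty per-request payloads go to debug; info stays for events operators
// generally want in their logs by default.
fn log_copy_outcome(response_body: &str) {
    debug!("Checking for error in response_body: {}", response_body);
    info!("copy / multipart upload completed");
}
```
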
-use tracing::info; +use tracing::{debug, info}; /// Retry request error #[derive(Debug, thiserror::Error)] @@ -296,7 +296,7 @@ impl RetryableRequest { })?; let response_body = String::from_utf8_lossy(&bytes); - info!("Checking for error in response_body: {}", response_body); + debug!("Checking for error in response_body: {}", response_body); if !body_contains_error(&response_body) { // Success response and no error, clone and return response From 28d56f6b5dcabae19cbc6801f2697891efd713c9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 2 Feb 2025 20:11:35 +0900 Subject: [PATCH 378/397] Remove all RCs after release (#7060) * Remove all RCs after release * Remove reference to old releases script * Keep running remove-old-artifacts.sh from release-tarball.sh --------- Co-authored-by: Andrew Lamb --- dev/release/release-tarball.sh | 4 +-- ...ld-releases.sh => remove-old-artifacts.sh} | 30 +++++++++++++++---- 2 files changed, 26 insertions(+), 8 deletions(-) rename dev/release/{remove-old-releases.sh => remove-old-artifacts.sh} (60%) diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh index 9581186..16b10e0 100755 --- a/dev/release/release-tarball.sh +++ b/dev/release/release-tarball.sh @@ -75,5 +75,5 @@ echo "Success!" echo "The release is available here:" echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" -echo "Clean up old versions from svn" -"${SOURCE_TOP_DIR}"/dev/release/remove-old-releases.sh +echo "Clean up old artifacts from svn" +"${SOURCE_TOP_DIR}"/dev/release/remove-old-artifacts.sh diff --git a/dev/release/remove-old-releases.sh b/dev/release/remove-old-artifacts.sh similarity index 60% rename from dev/release/remove-old-releases.sh rename to dev/release/remove-old-artifacts.sh index c8bd8b7..bbbbe0c 100755 --- a/dev/release/remove-old-releases.sh +++ b/dev/release/remove-old-artifacts.sh @@ -18,8 +18,8 @@ # under the License. # -# This script removes all but the most recent versions of arrow-rs -# from svn +# This script removes all RCs and all but the most recent versions of +# object_store from svn. 
# # The older versions are in SVN history as well as available on the # archive page https://archive.apache.org/dist/ @@ -29,17 +29,35 @@ set -e set -u +set -o pipefail -svn_base="https://dist.apache.org/repos/dist/release/arrow" +echo "Remove all RCs" +dev_base_url=https://dist.apache.org/repos/dist/dev/arrow +old_rcs=$( + svn ls ${dev_base_url}/ | \ + grep -E '^apache-arrow-object-store-rs-[0-9]' | \ + sort --version-sort +) +for old_rc in $old_rcs; do + echo "Remove RC: ${old_rc}" + svn \ + delete \ + -m "Remove old Apache Arrow Rust Object Store RC: ${old_rc}" \ + ${dev_base_url}/${old_rc} +done echo "Remove all but the most recent version" +release_base_url="https://dist.apache.org/repos/dist/release/arrow" old_releases=$( - svn ls ${svn_base} | \ + svn ls ${release_base_url} | \ grep -E '^arrow-object-store-rs-[0-9\.]+' | \ sort --version-sort --reverse | \ tail -n +2 ) for old_release_version in $old_releases; do - echo "Remove old release ${old_release_version}" - svn delete -m "Removing ${old_release_version}" ${svn_base}/${old_release_version} + echo "Remove old release: ${old_release_version}" + svn \ + delete \ + -m "Remove Apache Arrow Rust Object Store release: ${old_release_version}" \ + ${release_base_url}/${old_release_version} done From 347b331c8c7077ec6fbc39532e28d5d9d670f921 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Sun, 2 Feb 2025 13:20:13 +0100 Subject: [PATCH 379/397] Update rand requirement from 0.8 to 0.9 (#7045) * Update rand requirement from 0.8 to 0.9 Updates the requirements on [rand](https://github.com/rust-random/rand) to permit the latest version. - [Release notes](https://github.com/rust-random/rand/releases) - [Changelog](https://github.com/rust-random/rand/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-random/rand/compare/0.8.0...0.9.0) --- updated-dependencies: - dependency-name: rand dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Migrate to 0.9 * Add missing feature, also bump `rand` in `object_store` * Rustfmt * Name `UniformUsize` instance `uusize` instead of `usize` * Rename another use of `usize` --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.toml | 4 ++-- src/aws/dynamo.rs | 6 +++--- src/azure/client.rs | 2 +- src/client/backoff.rs | 8 ++++---- src/integration.rs | 10 +++++----- src/upload.rs | 6 +++--- src/util.rs | 14 +++++++------- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 992ae66..168d2eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ hyper = { version = "1.2", default-features = false, optional = true } quick-xml = { version = "0.37.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } -rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } +rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"], optional = true } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } @@ -76,7 +76,7 @@ futures-test = "0.3" hyper = { version = "1.2", features = ["server"] } hyper-util = "0.1" http-body-util = "0.1" -rand = "0.8" +rand = "0.9" tempfile = "3.1.0" regex = "1.11.1" # The "gzip" feature for reqwest is enabled for an integration test. diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index 6283e76..a66a343 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -527,8 +527,8 @@ mod tests { use super::*; use crate::aws::AmazonS3; use crate::ObjectStore; - use rand::distributions::Alphanumeric; - use rand::{thread_rng, Rng}; + use rand::distr::Alphanumeric; + use rand::{rng, Rng}; #[test] fn test_attribute_serde() { @@ -571,7 +571,7 @@ mod tests { _ => panic!("Should conflict"), } - let rng = thread_rng(); + let rng = rng(); let etag = String::from_utf8(rng.sample_iter(Alphanumeric).take(32).collect()).unwrap(); let t = Some(etag.as_str()); diff --git a/src/azure/client.rs b/src/azure/client.rs index 2c2e27e..7195729 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -561,7 +561,7 @@ impl AzureClient { _part_idx: usize, payload: PutPayload, ) -> Result { - let part_idx = u128::from_be_bytes(rand::thread_rng().gen()); + let part_idx = u128::from_be_bytes(rand::rng().random()); let content_id = format!("{part_idx:032x}"); let block_id = BASE64_STANDARD.encode(&content_id); diff --git a/src/client/backoff.rs b/src/client/backoff.rs index 8382a2e..8193e8b 100644 --- a/src/client/backoff.rs +++ b/src/client/backoff.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
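
The hunks in this patch are mostly mechanical renames. Condensed into one sketch, assuming the rand 0.9 API (the mapping covers only the names these hunks touch):

```rust
// rand 0.8             ->  rand 0.9
// rand::thread_rng()   ->  rand::rng()
// Rng::gen()           ->  Rng::random()
// Rng::gen_range(r)    ->  Rng::random_range(r)
// Rng::gen_bool(p)     ->  Rng::random_bool(p)
// rand::distributions  ->  rand::distr
use rand::Rng;

fn random_chunk_index(chunk_length: usize) -> (usize, bool) {
    let mut rng = rand::rng();
    // Assumes chunk_length > 0, as at the call sites below.
    (rng.random_range(0..chunk_length), rng.random_bool(0.5))
}
```
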
-use rand::prelude::*; +use rand::{prelude::*, rng}; use std::time::Duration; /// Exponential backoff with decorrelated jitter algorithm @@ -78,7 +78,7 @@ impl Backoff { /// Creates a new `Backoff` with the optional `rng` /// - /// Used [`rand::thread_rng()`] if no rng provided + /// Used [`rand::rng()`] if no rng provided pub(crate) fn new_with_rng( config: &BackoffConfig, rng: Option>, @@ -98,8 +98,8 @@ impl Backoff { let range = self.init_backoff..(self.next_backoff_secs * self.base); let rand_backoff = match self.rng.as_mut() { - Some(rng) => rng.gen_range(range), - None => thread_rng().gen_range(range), + Some(rng) => rng.random_range(range), + None => rng().random_range(range), }; let next_backoff = self.max_backoff_secs.min(rand_backoff); diff --git a/src/integration.rs b/src/integration.rs index 25a9294..5f9a92b 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -35,8 +35,8 @@ use crate::{ use bytes::Bytes; use futures::stream::FuturesUnordered; use futures::{StreamExt, TryStreamExt}; -use rand::distributions::Alphanumeric; -use rand::{thread_rng, Rng}; +use rand::distr::Alphanumeric; +use rand::{rng, Rng}; pub(crate) async fn flatten_list_stream( storage: &DynObjectStore, @@ -633,7 +633,7 @@ pub async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { // As a result each conditional operation will need to wait for the lease to timeout before proceeding // One solution would be to clear DynamoDB before each test, but this would require non-trivial additional code // so we instead just generate a random suffix for the filenames - let rng = thread_rng(); + let rng = rng(); let suffix = String::from_utf8(rng.sample_iter(Alphanumeric).take(32).collect()).unwrap(); delete_fixtures(storage).await; @@ -742,10 +742,10 @@ pub async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { /// Returns a chunk of length `chunk_length` fn get_chunk(chunk_length: usize) -> Bytes { let mut data = vec![0_u8; chunk_length]; - let mut rng = thread_rng(); + let mut rng = rng(); // Set a random selection of bytes for _ in 0..1000 { - data[rng.gen_range(0..chunk_length)] = rng.gen(); + data[rng.random_range(0..chunk_length)] = rng.random(); } data.into() } diff --git a/src/upload.rs b/src/upload.rs index 4df4d8f..af5975a 100644 --- a/src/upload.rs +++ b/src/upload.rs @@ -312,11 +312,11 @@ mod tests { let mut expected = Vec::with_capacity(1024); for _ in 0..50 { - let chunk_size = rng.gen_range(0..30); - let data: Vec<_> = (0..chunk_size).map(|_| rng.gen()).collect(); + let chunk_size = rng.random_range(0..30); + let data: Vec<_> = (0..chunk_size).map(|_| rng.random()).collect(); expected.extend_from_slice(&data); - match rng.gen_bool(method) { + match rng.random_bool(method) { true => write.put(data.into()), false => write.write(&data), } diff --git a/src/util.rs b/src/util.rs index 17a7a8c..f46c959 100644 --- a/src/util.rs +++ b/src/util.rs @@ -329,7 +329,7 @@ mod tests { use crate::Error; use super::*; - use rand::{thread_rng, Rng}; + use rand::{rng, Rng}; use std::ops::Range; /// Calls coalesce_ranges and validates the returned data is correct @@ -395,20 +395,20 @@ mod tests { #[tokio::test] async fn test_coalesce_fuzz() { - let mut rand = thread_rng(); + let mut rand = rng(); for _ in 0..100 { - let object_len = rand.gen_range(10..250); - let range_count = rand.gen_range(0..10); + let object_len = rand.random_range(10..250); + let range_count = rand.random_range(0..10); let ranges: Vec<_> = (0..range_count) .map(|_| { - let start = rand.gen_range(0..object_len); + let 
start = rand.random_range(0..object_len); let max_len = 20.min(object_len - start); - let len = rand.gen_range(0..max_len); + let len = rand.random_range(0..max_len); start..start + len }) .collect(); - let coalesce = rand.gen_range(1..5); + let coalesce = rand.random_range(1..5); let fetches = do_fetch(ranges.clone(), coalesce).await; for fetch in fetches.windows(2) { From 1e0c75056c00fe3166faf9f0e99437f62db17f8f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Feb 2025 16:04:12 +0000 Subject: [PATCH 380/397] Add test of out of order UploadPart (#7047) * Add test of out of order UploadPart * Update object_store/src/integration.rs --------- Co-authored-by: Andrew Lamb --- src/aws/mod.rs | 1 + src/azure/mod.rs | 1 + src/gcp/mod.rs | 1 + src/integration.rs | 35 +++++++++++++++++++++++++++++++++-- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 82ef909..b065927 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -571,6 +571,7 @@ mod tests { stream_get(&integration).await; multipart(&integration, &integration).await; multipart_race_condition(&integration, true).await; + multipart_out_of_order(&integration).await; signing(&integration).await; s3_encryption(&integration).await; put_get_attributes(&integration).await; diff --git a/src/azure/mod.rs b/src/azure/mod.rs index ea4dd8f..bbecba5 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -314,6 +314,7 @@ mod tests { put_opts(&integration, true).await; multipart(&integration, &integration).await; multipart_race_condition(&integration, false).await; + multipart_out_of_order(&integration).await; signing(&integration).await; let validate = !integration.client.config().disable_tagging; diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index a2f5124..2aa9976 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -298,6 +298,7 @@ mod test { stream_get(&integration).await; multipart(&integration, &integration).await; multipart_race_condition(&integration, true).await; + multipart_out_of_order(&integration).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; put_opts(&integration, true).await; diff --git a/src/integration.rs b/src/integration.rs index 5f9a92b..f10b2d3 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -29,8 +29,8 @@ use core::str; use crate::multipart::MultipartStore; use crate::path::Path; use crate::{ - Attribute, Attributes, DynObjectStore, Error, GetOptions, GetRange, ObjectStore, PutMode, - PutPayload, UpdateVersion, WriteMultipart, + Attribute, Attributes, DynObjectStore, Error, GetOptions, GetRange, MultipartUpload, + ObjectStore, PutMode, PutPayload, UpdateVersion, WriteMultipart, }; use bytes::Bytes; use futures::stream::FuturesUnordered; @@ -1196,3 +1196,34 @@ pub async fn multipart_race_condition(storage: &dyn ObjectStore, last_writer_win )); } } + +/// Tests performing out of order multipart uploads +pub async fn multipart_out_of_order(storage: &dyn ObjectStore) { + let path = Path::from("test_multipart_out_of_order"); + let mut multipart_upload = storage.put_multipart(&path).await.unwrap(); + + let part1 = std::iter::repeat(b'1') + .take(5 * 1024 * 1024) + .collect::(); + let part2 = std::iter::repeat(b'2') + .take(5 * 1024 * 1024) + .collect::(); + let part3 = std::iter::repeat(b'3') + .take(5 * 1024 * 1024) + .collect::(); + let full = [part1.as_ref(), part2.as_ref(), part3.as_ref()].concat(); + + let fut1 = multipart_upload.put_part(part1.into()); + let fut2 = 
multipart_upload.put_part(part2.into()); + let fut3 = multipart_upload.put_part(part3.into()); + // note order is 2,3,1 , different than the parts were created in + fut2.await.unwrap(); + fut3.await.unwrap(); + fut1.await.unwrap(); + + multipart_upload.complete().await.unwrap(); + + let result = storage.get(&path).await.unwrap(); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes, full); +} From 6c3de451a7931bff1c561632c31e3cb9f01faf8b Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Thu, 6 Feb 2025 12:12:31 -0500 Subject: [PATCH 381/397] Fix LocalFileSystem with range request that ends beyond end of file (#6751) * Fix LocalFileSystem with range request that ends beyond end of file * fix windows * add comment * Seek error * fix seek check * remove windows flag * Get file length from file metadata --- src/local.rs | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/src/local.rs b/src/local.rs index 65e87f9..6fef461 100644 --- a/src/local.rs +++ b/src/local.rs @@ -868,7 +868,27 @@ pub(crate) fn chunked_stream( } pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { - let to_read = range.end - range.start; + let file_metadata = file.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: path.to_string_lossy().to_string(), + })?; + + // If none of the range is satisfiable we should error, e.g. if the start offset is beyond the + // extents of the file + let file_len = file_metadata.len(); + if range.start >= file_len { + return Err(Error::InvalidRange { + source: InvalidGetRange::StartTooLarge { + requested: range.start, + length: file_len, + }, + } + .into()); + } + + // Don't read past end of file + let to_read = range.end.min(file_len) - range.start; + file.seek(SeekFrom::Start(range.start)).map_err(|source| { let path = path.into(); Error::Seek { source, path } @@ -1131,6 +1151,44 @@ mod tests { assert_eq!(&*read_data, data); } + #[tokio::test] + async fn range_request_start_beyond_end_of_file() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + + integration + .put(&location, data.clone().into()) + .await + .unwrap(); + + integration + .get_range(&location, 100..200) + .await + .expect_err("Should error with start range beyond end of file"); + } + + #[tokio::test] + async fn range_request_beyond_end_of_file() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + + integration + .put(&location, data.clone().into()) + .await + .unwrap(); + + let read_data = integration.get_range(&location, 0..100).await.unwrap(); + assert_eq!(&*read_data, data); + } + #[tokio::test] #[cfg(target_family = "unix")] // Fails on github actions runner (which runs the tests as root) From 2bed30253e84beb7bd5587cfc85c822dd0e77ee2 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Sat, 8 Feb 2025 08:27:40 -0800 Subject: [PATCH 382/397] Add a custom implementation `LocalFileSystem::list_with_offset` (#7019) * Initial change from Daniel. * Upgrade unit test to be more generic. * Add comments on why we have filter * Cleanup unit tests. * Update object_store/src/local.rs Co-authored-by: Adam Reeve * Add changes suggested by Adam. * Cleanup match error. 
* Apply formatting changes suggested by cargo +stable fmt --all. * Apply cosmetic changes suggested by clippy. * Upgrade test_path_with_offset to create temporary directory + files for testing rather than pointing to existing dir. --------- Co-authored-by: Adam Reeve --- src/local.rs | 219 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 155 insertions(+), 64 deletions(-) diff --git a/src/local.rs b/src/local.rs index 6fef461..ccf6e34 100644 --- a/src/local.rs +++ b/src/local.rs @@ -483,71 +483,15 @@ impl ObjectStore for LocalFileSystem { } fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { - let config = Arc::clone(&self.config); - - let root_path = match prefix { - Some(prefix) => match config.prefix_to_filesystem(prefix) { - Ok(path) => path, - Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(), - }, - None => self.config.root.to_file_path().unwrap(), - }; - - let walkdir = WalkDir::new(root_path) - // Don't include the root directory itself - .min_depth(1) - .follow_links(true); - - let s = walkdir.into_iter().flat_map(move |result_dir_entry| { - let entry = match convert_walkdir_result(result_dir_entry).transpose()? { - Ok(entry) => entry, - Err(e) => return Some(Err(e)), - }; - - if !entry.path().is_file() { - return None; - } - - match config.filesystem_to_path(entry.path()) { - Ok(path) => match is_valid_file_path(&path) { - true => convert_entry(entry, path).transpose(), - false => None, - }, - Err(e) => Some(Err(e)), - } - }); - - // If no tokio context, return iterator directly as no - // need to perform chunked spawn_blocking reads - if tokio::runtime::Handle::try_current().is_err() { - return futures::stream::iter(s).boxed(); - } - - // Otherwise list in batches of CHUNK_SIZE - const CHUNK_SIZE: usize = 1024; - - let buffer = VecDeque::with_capacity(CHUNK_SIZE); - futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { - if buffer.is_empty() { - (s, buffer) = tokio::task::spawn_blocking(move || { - for _ in 0..CHUNK_SIZE { - match s.next() { - Some(r) => buffer.push_back(r), - None => break, - } - } - (s, buffer) - }) - .await?; - } + self.list_with_maybe_offset(prefix, None) + } - match buffer.pop_front() { - Some(Err(e)) => Err(e), - Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), - None => Ok(None), - } - }) - .boxed() + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'static, Result> { + self.list_with_maybe_offset(prefix, Some(offset)) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -678,6 +622,93 @@ impl ObjectStore for LocalFileSystem { } } +impl LocalFileSystem { + fn list_with_maybe_offset( + &self, + prefix: Option<&Path>, + maybe_offset: Option<&Path>, + ) -> BoxStream<'static, Result> { + let config = Arc::clone(&self.config); + + let root_path = match prefix { + Some(prefix) => match config.prefix_to_filesystem(prefix) { + Ok(path) => path, + Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(), + }, + None => config.root.to_file_path().unwrap(), + }; + + let walkdir = WalkDir::new(root_path) + // Don't include the root directory itself + .min_depth(1) + .follow_links(true); + + let maybe_offset = maybe_offset.cloned(); + + let s = walkdir.into_iter().flat_map(move |result_dir_entry| { + // Apply offset filter before proceeding, to reduce statx file system calls + // This matters for NFS mounts + if let (Some(offset), Ok(entry)) = (maybe_offset.as_ref(), result_dir_entry.as_ref()) { + let location = 
config.filesystem_to_path(entry.path()); + match location { + Ok(path) if path <= *offset => return None, + Err(e) => return Some(Err(e)), + _ => {} + } + } + + let entry = match convert_walkdir_result(result_dir_entry).transpose()? { + Ok(entry) => entry, + Err(e) => return Some(Err(e)), + }; + + if !entry.path().is_file() { + return None; + } + + match config.filesystem_to_path(entry.path()) { + Ok(path) => match is_valid_file_path(&path) { + true => convert_entry(entry, path).transpose(), + false => None, + }, + Err(e) => Some(Err(e)), + } + }); + + // If no tokio context, return iterator directly as no + // need to perform chunked spawn_blocking reads + if tokio::runtime::Handle::try_current().is_err() { + return futures::stream::iter(s).boxed(); + } + + // Otherwise list in batches of CHUNK_SIZE + const CHUNK_SIZE: usize = 1024; + + let buffer = VecDeque::with_capacity(CHUNK_SIZE); + futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { + if buffer.is_empty() { + (s, buffer) = tokio::task::spawn_blocking(move || { + for _ in 0..CHUNK_SIZE { + match s.next() { + Some(r) => buffer.push_back(r), + None => break, + } + } + (s, buffer) + }) + .await?; + } + + match buffer.pop_front() { + Some(Err(e)) => Err(e), + Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), + None => Ok(None), + } + }) + .boxed() + } +} + /// Creates the parent directories of `path` or returns an error based on `source` if no parent fn create_parent_dirs(path: &std::path::Path, source: io::Error) -> Result<()> { let parent = path.parent().ok_or_else(|| { @@ -1459,6 +1490,66 @@ mod tests { ); } + #[tokio::test] + async fn test_path_with_offset() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let root_path = root.path(); + for i in 0..5 { + let filename = format!("test{}.parquet", i); + let file = root_path.join(filename); + std::fs::write(file, "test").unwrap(); + } + let filter_str = "test"; + let filter = String::from(filter_str); + let offset_str = filter + "1"; + let offset = Path::from(offset_str.clone()); + + // Use list_with_offset to retrieve files + let res = integration.list_with_offset(None, &offset); + let offset_paths: Vec<_> = res.map_ok(|x| x.location).try_collect().await.unwrap(); + let mut offset_files: Vec<_> = offset_paths + .iter() + .map(|x| String::from(x.filename().unwrap())) + .collect(); + + // Check result with direct filesystem read + let files = fs::read_dir(root_path).unwrap(); + let filtered_files = files + .filter_map(Result::ok) + .filter_map(|d| { + d.file_name().to_str().and_then(|f| { + if f.contains(filter_str) { + Some(String::from(f)) + } else { + None + } + }) + }) + .collect::>(); + + let mut expected_offset_files: Vec<_> = filtered_files + .iter() + .filter(|s| **s > offset_str) + .cloned() + .collect(); + + fn do_vecs_match(a: &[T], b: &[T]) -> bool { + let matching = a.iter().zip(b.iter()).filter(|&(a, b)| a == b).count(); + matching == a.len() && matching == b.len() + } + + offset_files.sort(); + expected_offset_files.sort(); + + // println!("Expected Offset Files: {:?}", expected_offset_files); + // println!("Actual Offset Files: {:?}", offset_files); + + assert_eq!(offset_files.len(), expected_offset_files.len()); + assert!(do_vecs_match(&expected_offset_files, &offset_files)); + } + #[tokio::test] async fn filesystem_filename_with_percent() { let temp_dir = TempDir::new().unwrap(); From 59e7ab4f953423e774a8688a3e8db0be76ebfee6 Mon Sep 17 00:00:00 2001 From: james-rms Date: Wed, 12 
Feb 2025 09:03:43 +1100 Subject: [PATCH 383/397] object_store/gcp: derive Clone for GoogleCloudStorage (#7112) --- src/gcp/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 2aa9976..2f6630d 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -72,7 +72,7 @@ pub type GcpSigningCredentialProvider = Arc>; /// Interface for [Google Cloud Storage](https://cloud.google.com/storage/). -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct GoogleCloudStorage { client: Arc, } From c763a4c58883d136e918fa55d22481f7194c1d46 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 12 Feb 2025 14:17:09 +0100 Subject: [PATCH 384/397] feat(object_store): random IP address selection (#7123) * feat(object_store): random IP address selection Closes #7117. * refactor: directly call stdlib w/o hyper-util --- src/client/dns.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++++ src/client/mod.rs | 18 +++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 src/client/dns.rs diff --git a/src/client/dns.rs b/src/client/dns.rs new file mode 100644 index 0000000..32e9291 --- /dev/null +++ b/src/client/dns.rs @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::net::ToSocketAddrs; + +use rand::prelude::SliceRandom; +use reqwest::dns::{Addrs, Name, Resolve, Resolving}; +use tokio::task::JoinSet; + +type DynErr = Box; + +#[derive(Debug)] +pub(crate) struct ShuffleResolver; + +impl Resolve for ShuffleResolver { + fn resolve(&self, name: Name) -> Resolving { + Box::pin(async move { + // use `JoinSet` to propagate cancelation + let mut tasks = JoinSet::new(); + tasks.spawn_blocking(move || { + let it = (name.as_str(), 0).to_socket_addrs()?; + let mut addrs = it.collect::>(); + + addrs.shuffle(&mut rand::rng()); + + Ok(Box::new(addrs.into_iter()) as Addrs) + }); + + tasks + .join_next() + .await + .expect("spawned on task") + .map_err(|err| Box::new(err) as DynErr)? + }) + } +} diff --git a/src/client/mod.rs b/src/client/mod.rs index 1b7ce5a..6297159 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -19,6 +19,8 @@ pub(crate) mod backoff; +mod dns; + #[cfg(test)] pub(crate) mod mock_server; @@ -110,6 +112,10 @@ pub enum ClientConfigKey { ProxyCaCertificate, /// List of hosts that bypass proxy ProxyExcludes, + /// Randomize order addresses that the DNS resolution yields. + /// + /// This will spread the connections accross more servers. 
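
The resolver above is wired into the existing string-keyed client configuration, so the behaviour can be toggled like any other option. A sketch of the round trip, assuming the pre-existing `ClientOptions::with_config` and `get_config_value` helpers, which are not part of this hunk; shuffling defaults to on:

```rust
use object_store::{ClientConfigKey, ClientOptions};

fn disable_address_shuffling() -> ClientOptions {
    // "randomize_addresses" parses to the new key via the FromStr impl below.
    let key: ClientConfigKey = "randomize_addresses".parse().unwrap();
    let options = ClientOptions::new().with_config(key, "false");
    assert_eq!(
        options.get_config_value(&ClientConfigKey::RandomizeAddresses),
        Some("false".to_string())
    );
    options
}
```
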
+ RandomizeAddresses, /// Request timeout /// /// The timeout is applied from when the request starts connecting until the @@ -137,6 +143,7 @@ impl AsRef for ClientConfigKey { Self::ProxyUrl => "proxy_url", Self::ProxyCaCertificate => "proxy_ca_certificate", Self::ProxyExcludes => "proxy_excludes", + Self::RandomizeAddresses => "randomize_addresses", Self::Timeout => "timeout", Self::UserAgent => "user_agent", } @@ -163,6 +170,7 @@ impl FromStr for ClientConfigKey { "proxy_url" => Ok(Self::ProxyUrl), "proxy_ca_certificate" => Ok(Self::ProxyCaCertificate), "proxy_excludes" => Ok(Self::ProxyExcludes), + "randomize_addresses" => Ok(Self::RandomizeAddresses), "timeout" => Ok(Self::Timeout), "user_agent" => Ok(Self::UserAgent), _ => Err(super::Error::UnknownConfigurationKey { @@ -245,6 +253,7 @@ pub struct ClientOptions { http2_max_frame_size: Option>, http1_only: ConfigValue, http2_only: ConfigValue, + randomize_addresses: ConfigValue, } impl Default for ClientOptions { @@ -280,6 +289,7 @@ impl Default for ClientOptions { // https://github.com/apache/arrow-rs/issues/5194 http1_only: true.into(), http2_only: Default::default(), + randomize_addresses: true.into(), } } } @@ -322,6 +332,9 @@ impl ClientOptions { ClientConfigKey::ProxyUrl => self.proxy_url = Some(value.into()), ClientConfigKey::ProxyCaCertificate => self.proxy_ca_certificate = Some(value.into()), ClientConfigKey::ProxyExcludes => self.proxy_excludes = Some(value.into()), + ClientConfigKey::RandomizeAddresses => { + self.randomize_addresses.parse(value); + } ClientConfigKey::Timeout => self.timeout = Some(ConfigValue::Deferred(value.into())), ClientConfigKey::UserAgent => { self.user_agent = Some(ConfigValue::Deferred(value.into())) @@ -358,6 +371,7 @@ impl ClientOptions { ClientConfigKey::ProxyUrl => self.proxy_url.clone(), ClientConfigKey::ProxyCaCertificate => self.proxy_ca_certificate.clone(), ClientConfigKey::ProxyExcludes => self.proxy_excludes.clone(), + ClientConfigKey::RandomizeAddresses => Some(self.randomize_addresses.to_string()), ClientConfigKey::Timeout => self.timeout.as_ref().map(fmt_duration), ClientConfigKey::UserAgent => self .user_agent @@ -675,6 +689,10 @@ impl ClientOptions { // transparently decompress the body via the non-default `gzip` feature. builder = builder.no_gzip(); + if self.randomize_addresses.get()? { + builder = builder.dns_resolver(Arc::new(dns::ShuffleResolver)); + } + builder .https_only(!self.allow_http.get()?) .build() From a4b106b80cf90849377ad471875ccf1e33688fd0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 12 Feb 2025 09:37:06 -0500 Subject: [PATCH 385/397] Revert "Update rand requirement from 0.8 to 0.9 (#7045)" (#7125) * Revert "Update rand requirement from 0.8 to 0.9 (#7045)" This reverts commit 69eeee336eb216a990ad5eb2b4ded93b547594b1. 
* downgrade API --- Cargo.toml | 4 ++-- src/aws/dynamo.rs | 6 +++--- src/azure/client.rs | 2 +- src/client/backoff.rs | 8 ++++---- src/client/dns.rs | 2 +- src/integration.rs | 10 +++++----- src/upload.rs | 6 +++--- src/util.rs | 14 +++++++------- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 168d2eb..992ae66 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ hyper = { version = "1.2", default-features = false, optional = true } quick-xml = { version = "0.37.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } -rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"], optional = true } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } @@ -76,7 +76,7 @@ futures-test = "0.3" hyper = { version = "1.2", features = ["server"] } hyper-util = "0.1" http-body-util = "0.1" -rand = "0.9" +rand = "0.8" tempfile = "3.1.0" regex = "1.11.1" # The "gzip" feature for reqwest is enabled for an integration test. diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index a66a343..6283e76 100644 --- a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -527,8 +527,8 @@ mod tests { use super::*; use crate::aws::AmazonS3; use crate::ObjectStore; - use rand::distr::Alphanumeric; - use rand::{rng, Rng}; + use rand::distributions::Alphanumeric; + use rand::{thread_rng, Rng}; #[test] fn test_attribute_serde() { @@ -571,7 +571,7 @@ mod tests { _ => panic!("Should conflict"), } - let rng = rng(); + let rng = thread_rng(); let etag = String::from_utf8(rng.sample_iter(Alphanumeric).take(32).collect()).unwrap(); let t = Some(etag.as_str()); diff --git a/src/azure/client.rs b/src/azure/client.rs index 7195729..2c2e27e 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -561,7 +561,7 @@ impl AzureClient { _part_idx: usize, payload: PutPayload, ) -> Result { - let part_idx = u128::from_be_bytes(rand::rng().random()); + let part_idx = u128::from_be_bytes(rand::thread_rng().gen()); let content_id = format!("{part_idx:032x}"); let block_id = BASE64_STANDARD.encode(&content_id); diff --git a/src/client/backoff.rs b/src/client/backoff.rs index 8193e8b..8382a2e 100644 --- a/src/client/backoff.rs +++ b/src/client/backoff.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use rand::{prelude::*, rng}; +use rand::prelude::*; use std::time::Duration; /// Exponential backoff with decorrelated jitter algorithm @@ -78,7 +78,7 @@ impl Backoff { /// Creates a new `Backoff` with the optional `rng` /// - /// Used [`rand::rng()`] if no rng provided + /// Used [`rand::thread_rng()`] if no rng provided pub(crate) fn new_with_rng( config: &BackoffConfig, rng: Option>, @@ -98,8 +98,8 @@ impl Backoff { let range = self.init_backoff..(self.next_backoff_secs * self.base); let rand_backoff = match self.rng.as_mut() { - Some(rng) => rng.random_range(range), - None => rng().random_range(range), + Some(rng) => rng.gen_range(range), + None => thread_rng().gen_range(range), }; let next_backoff = self.max_backoff_secs.min(rand_backoff); diff --git a/src/client/dns.rs b/src/client/dns.rs index 32e9291..51df926 100644 --- a/src/client/dns.rs +++ b/src/client/dns.rs @@ -35,7 +35,7 @@ impl Resolve for ShuffleResolver { let it = (name.as_str(), 0).to_socket_addrs()?; let mut addrs = it.collect::>(); - addrs.shuffle(&mut rand::rng()); + addrs.shuffle(&mut rand::thread_rng()); Ok(Box::new(addrs.into_iter()) as Addrs) }); diff --git a/src/integration.rs b/src/integration.rs index f10b2d3..5a133f7 100644 --- a/src/integration.rs +++ b/src/integration.rs @@ -35,8 +35,8 @@ use crate::{ use bytes::Bytes; use futures::stream::FuturesUnordered; use futures::{StreamExt, TryStreamExt}; -use rand::distr::Alphanumeric; -use rand::{rng, Rng}; +use rand::distributions::Alphanumeric; +use rand::{thread_rng, Rng}; pub(crate) async fn flatten_list_stream( storage: &DynObjectStore, @@ -633,7 +633,7 @@ pub async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { // As a result each conditional operation will need to wait for the lease to timeout before proceeding // One solution would be to clear DynamoDB before each test, but this would require non-trivial additional code // so we instead just generate a random suffix for the filenames - let rng = rng(); + let rng = thread_rng(); let suffix = String::from_utf8(rng.sample_iter(Alphanumeric).take(32).collect()).unwrap(); delete_fixtures(storage).await; @@ -742,10 +742,10 @@ pub async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { /// Returns a chunk of length `chunk_length` fn get_chunk(chunk_length: usize) -> Bytes { let mut data = vec![0_u8; chunk_length]; - let mut rng = rng(); + let mut rng = thread_rng(); // Set a random selection of bytes for _ in 0..1000 { - data[rng.random_range(0..chunk_length)] = rng.random(); + data[rng.gen_range(0..chunk_length)] = rng.gen(); } data.into() } diff --git a/src/upload.rs b/src/upload.rs index af5975a..4df4d8f 100644 --- a/src/upload.rs +++ b/src/upload.rs @@ -312,11 +312,11 @@ mod tests { let mut expected = Vec::with_capacity(1024); for _ in 0..50 { - let chunk_size = rng.random_range(0..30); - let data: Vec<_> = (0..chunk_size).map(|_| rng.random()).collect(); + let chunk_size = rng.gen_range(0..30); + let data: Vec<_> = (0..chunk_size).map(|_| rng.gen()).collect(); expected.extend_from_slice(&data); - match rng.random_bool(method) { + match rng.gen_bool(method) { true => write.put(data.into()), false => write.write(&data), } diff --git a/src/util.rs b/src/util.rs index f46c959..17a7a8c 100644 --- a/src/util.rs +++ b/src/util.rs @@ -329,7 +329,7 @@ mod tests { use crate::Error; use super::*; - use rand::{rng, Rng}; + use rand::{thread_rng, Rng}; use std::ops::Range; /// Calls coalesce_ranges and validates the returned data is correct @@ -395,20 +395,20 @@ mod tests { #[tokio::test] 
async fn test_coalesce_fuzz() { - let mut rand = rng(); + let mut rand = thread_rng(); for _ in 0..100 { - let object_len = rand.random_range(10..250); - let range_count = rand.random_range(0..10); + let object_len = rand.gen_range(10..250); + let range_count = rand.gen_range(0..10); let ranges: Vec<_> = (0..range_count) .map(|_| { - let start = rand.random_range(0..object_len); + let start = rand.gen_range(0..object_len); let max_len = 20.min(object_len - start); - let len = rand.random_range(0..max_len); + let len = rand.gen_range(0..max_len); start..start + len }) .collect(); - let coalesce = rand.random_range(1..5); + let coalesce = rand.gen_range(1..5); let fetches = do_fetch(ranges.clone(), coalesce).await; for fetch in fetches.windows(2) { From ef1337ae5c1b63fd5a04a1414e48c09ddef48d26 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Fri, 14 Feb 2025 14:00:10 +0100 Subject: [PATCH 386/397] refactor: remove unused `async` from `InMemory::entry` (#7133) --- src/memory.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/memory.rs b/src/memory.rs index 26beff1..f03dbc6 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -235,7 +235,7 @@ impl ObjectStore for InMemory { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let entry = self.entry(location).await?; + let entry = self.entry(location)?; let e_tag = entry.e_tag.to_string(); let meta = ObjectMeta { @@ -270,7 +270,7 @@ impl ObjectStore for InMemory { } async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { - let entry = self.entry(location).await?; + let entry = self.entry(location)?; ranges .iter() .map(|range| { @@ -295,7 +295,7 @@ impl ObjectStore for InMemory { } async fn head(&self, location: &Path) -> Result { - let entry = self.entry(location).await?; + let entry = self.entry(location)?; Ok(ObjectMeta { location: location.clone(), @@ -390,7 +390,7 @@ impl ObjectStore for InMemory { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let entry = self.entry(from).await?; + let entry = self.entry(from)?; self.storage .write() .insert(to, entry.data, entry.attributes); @@ -398,7 +398,7 @@ impl ObjectStore for InMemory { } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - let entry = self.entry(from).await?; + let entry = self.entry(from)?; let mut storage = self.storage.write(); if storage.map.contains_key(to) { return Err(Error::AlreadyExists { @@ -483,7 +483,7 @@ impl InMemory { Self { storage } } - async fn entry(&self, location: &Path) -> Result { + fn entry(&self, location: &Path) -> Result { let storage = self.storage.read(); let value = storage .map From 1c9b24772de44f4ae73e5f666fca0969dcea4d00 Mon Sep 17 00:00:00 2001 From: Kyle Lacy Date: Wed, 19 Feb 2025 00:30:38 -0800 Subject: [PATCH 387/397] object_store: Disable all compression formats in HTTP reqwest client (#7143) --- src/client/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/client/mod.rs b/src/client/mod.rs index 6297159..11f8b3e 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -685,9 +685,11 @@ impl ClientOptions { builder = builder.danger_accept_invalid_certs(true) } - // Reqwest will remove the `Content-Length` header if it is configured to - // transparently decompress the body via the non-default `gzip` feature. - builder = builder.no_gzip(); + // Explicitly disable compression, since it may be automatically enabled + // when certain reqwest features are enabled. 
Compression interferes + // with the `Content-Length` header, which is used to determine the + // size of objects. + builder = builder.no_gzip().no_brotli().no_zstd().no_deflate(); if self.randomize_addresses.get()? { builder = builder.dns_resolver(Arc::new(dns::ShuffleResolver)); From b4b7fd5b7b03b551359233941680c8aa2275cb1f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 26 Feb 2025 22:28:09 +0000 Subject: [PATCH 388/397] Decouple ObjectStore from Reqwest (#7183) * Create HttpClient * WIP * Decouple from reqwest * Hook up HttpConnector * WIP * Update other stores * Format * Remove out of date comment * RAT * Test fixes * Lints * Fix feature flags --- Cargo.toml | 17 +- src/aws/builder.rs | 34 +++- src/aws/client.rs | 64 +++--- src/aws/credential.rs | 111 +++++++---- src/aws/dynamo.rs | 17 +- src/aws/mod.rs | 4 +- src/azure/builder.rs | 25 ++- src/azure/client.rs | 99 +++++---- src/azure/credential.rs | 56 +++--- src/azure/mod.rs | 4 +- src/client/body.rs | 213 ++++++++++++++++++++ src/client/builder.rs | 279 ++++++++++++++++++++++++++ src/client/connection.rs | 237 ++++++++++++++++++++++ src/client/get.rs | 21 +- src/client/header.rs | 4 +- src/client/mod.rs | 31 ++- src/client/retry.rs | 420 +++++++++++++++++++-------------------- src/gcp/builder.rs | 30 ++- src/gcp/client.rs | 52 ++--- src/gcp/credential.rs | 45 +++-- src/gcp/mod.rs | 4 +- src/http/client.rs | 33 +-- src/http/mod.rs | 20 +- src/lib.rs | 6 +- src/parse.rs | 2 +- src/payload.rs | 7 - 26 files changed, 1334 insertions(+), 501 deletions(-) create mode 100644 src/client/body.rs create mode 100644 src/client/builder.rs create mode 100644 src/client/connection.rs diff --git a/Cargo.toml b/Cargo.toml index 992ae66..7e51245 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,24 +45,28 @@ walkdir = { version = "2", optional = true } # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } +form_urlencoded = { version = "1.2", optional = true } +http = { version = "1.2.0", optional = true } +http-body-util = { version = "0.1", optional = true } +httparse = { version = "1.8.0", default-features = false, features = ["std"], optional = true } hyper = { version = "1.2", default-features = false, optional = true } +md-5 = { version = "0.10.6", default-features = false, optional = true } quick-xml = { version = "0.37.0", features = ["serialize", "overlapped-lists"], optional = true } -serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } -serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } +serde_urlencoded = { version = "0.7", optional = true } tokio = { version = "1.29.0", features = ["sync", "macros", "rt", "time", "io-util"] } -md-5 = { version = "0.10.6", default-features = false, optional = true } -httparse = { version = "1.8.0", default-features = false, 
features = ["std"], optional = true } [target.'cfg(target_family="unix")'.dev-dependencies] nix = { version = "0.29.0", features = ["fs"] } [features] default = ["fs"] -cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "dep:http", "http-body-util", "form_urlencoded", "serde_urlencoded"] azure = ["cloud", "httparse"] fs = ["walkdir"] gcp = ["cloud", "rustls-pemfile"] @@ -72,16 +76,13 @@ tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] integration = [] [dev-dependencies] # In alphabetical order -futures-test = "0.3" hyper = { version = "1.2", features = ["server"] } hyper-util = "0.1" -http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" regex = "1.11.1" # The "gzip" feature for reqwest is enabled for an integration test. reqwest = { version = "0.12", features = ["gzip"] } -http = "1.1.0" [[test]] name = "get_range_file" diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 29b2eef..5e3d32e 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -23,7 +23,7 @@ use crate::aws::{ AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3ConditionalPut, S3CopyIfNotExists, STORE, }; -use crate::client::TokenCredentialProvider; +use crate::client::{HttpConnector, ReqwestConnector, TokenCredentialProvider}; use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use base64::prelude::BASE64_STANDARD; @@ -171,6 +171,8 @@ pub struct AmazonS3Builder { encryption_customer_key_base64: Option, /// When set to true, charge requester for bucket operations request_payer: ConfigValue, + /// The [`HttpConnector`] to use + http_connector: Option>, } /// Configuration keys for [`AmazonS3Builder`] @@ -882,6 +884,12 @@ impl AmazonS3Builder { self } + /// Overrides the [`HttpConnector`], by default uses [`ReqwestConnector`] + pub fn with_http_connector(mut self, connector: C) -> Self { + self.http_connector = Some(Arc::new(connector)); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. 
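A usage sketch for the connector hook introduced above, assuming the default reqwest-backed connector is publicly re-exported next to the builder; any other `HttpConnector` implementation could be passed instead, and the import path shown is an assumption rather than something this patch confirms:

    use object_store::aws::{AmazonS3, AmazonS3Builder};
    use object_store::client::ReqwestConnector; // assumed re-export location

    fn build_store() -> object_store::Result<AmazonS3> {
        AmazonS3Builder::from_env()
            .with_bucket_name("my-bucket") // hypothetical bucket name
            .with_http_connector(ReqwestConnector::default())
            .build()
    }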
pub fn build(mut self) -> Result { @@ -889,6 +897,10 @@ impl AmazonS3Builder { self.parse_url(&url)?; } + let http = self + .http_connector + .unwrap_or_else(|| Arc::new(ReqwestConnector::default())); + let bucket = self.bucket_name.ok_or(Error::MissingBucketName)?; let region = self.region.unwrap_or_else(|| "us-east-1".to_string()); let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; @@ -925,11 +937,7 @@ impl AmazonS3Builder { let endpoint = format!("https://sts.{region}.amazonaws.com"); // Disallow non-HTTPs requests - let client = self - .client_options - .clone() - .with_allow_http(false) - .client()?; + let options = self.client_options.clone().with_allow_http(false); let token = WebIdentityProvider { token_path, @@ -940,16 +948,19 @@ impl AmazonS3Builder { Arc::new(TokenCredentialProvider::new( token, - client, + http.connect(&options)?, self.retry_config.clone(), )) as _ } else if let Some(uri) = self.container_credentials_relative_uri { info!("Using Task credential provider"); + + let options = self.client_options.clone().with_allow_http(true); + Arc::new(TaskCredentialProvider { url: format!("http://169.254.170.2{uri}"), retry: self.retry_config.clone(), // The instance metadata endpoint is access over HTTP - client: self.client_options.clone().with_allow_http(true).client()?, + client: http.connect(&options)?, cache: Default::default(), }) as _ } else { @@ -964,7 +975,7 @@ impl AmazonS3Builder { Arc::new(TokenCredentialProvider::new( token, - self.client_options.metadata_client()?, + http.connect(&self.client_options.metadata_options())?, self.retry_config.clone(), )) as _ }; @@ -986,7 +997,7 @@ impl AmazonS3Builder { region: region.clone(), credentials: Arc::clone(&credentials), }, - self.client_options.client()?, + http.connect(&self.client_options)?, self.retry_config.clone(), ) .with_min_ttl(Duration::from_secs(60)), // Credentials only valid for 5 minutes @@ -1039,7 +1050,8 @@ impl AmazonS3Builder { request_payer: self.request_payer.get()?, }; - let client = Arc::new(S3Client::new(config)?); + let http_client = http.connect(&config.client_options)?; + let client = Arc::new(S3Client::new(config, http_client)); Ok(AmazonS3 { client }) } diff --git a/src/aws/client.rs b/src/aws/client.rs index 246f277..2cf808a 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -22,6 +22,7 @@ use crate::aws::{ AwsAuthorizer, AwsCredentialProvider, S3ConditionalPut, S3CopyIfNotExists, COPY_SOURCE_HEADER, STORE, STRICT_PATH_ENCODE_SET, TAGS_HEADER, }; +use crate::client::builder::{HttpRequestBuilder, RequestBuilderError}; use crate::client::get::GetClient; use crate::client::header::{get_etag, HeaderConfig}; use crate::client::header::{get_put_result, get_version}; @@ -31,7 +32,7 @@ use crate::client::s3::{ CompleteMultipartUpload, CompleteMultipartUploadResult, CopyPartResult, InitiateMultipartUploadResult, ListResponse, PartMetadata, }; -use crate::client::GetOptionsExt; +use crate::client::{GetOptionsExt, HttpClient, HttpError, HttpResponse}; use crate::multipart::PartId; use crate::path::DELIMITER; use crate::{ @@ -42,17 +43,15 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; -use hyper::header::{ +use http::header::{ CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_TYPE, }; -use hyper::http::HeaderName; -use hyper::{http, HeaderMap}; +use http::{HeaderMap, HeaderName, Method}; use itertools::Itertools; use md5::{Digest, Md5}; use 
percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; -use reqwest::{Client as ReqwestClient, Method, RequestBuilder, Response}; use ring::digest; use ring::digest::Context; use serde::{Deserialize, Serialize}; @@ -67,7 +66,9 @@ const ALGORITHM: &str = "x-amz-checksum-algorithm"; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { #[error("Error performing DeleteObjects request: {}", source)] - DeleteObjectsRequest { source: crate::client::retry::Error }, + DeleteObjectsRequest { + source: crate::client::retry::RetryError, + }, #[error( "DeleteObjects request failed for key {}: {} (code: {})", @@ -82,7 +83,7 @@ pub(crate) enum Error { }, #[error("Error getting DeleteObjects response body: {}", source)] - DeleteObjectsResponse { source: reqwest::Error }, + DeleteObjectsResponse { source: HttpError }, #[error("Got invalid DeleteObjects response: {}", source)] InvalidDeleteObjectsResponse { @@ -90,22 +91,24 @@ pub(crate) enum Error { }, #[error("Error performing list request: {}", source)] - ListRequest { source: crate::client::retry::Error }, + ListRequest { + source: crate::client::retry::RetryError, + }, #[error("Error getting list response body: {}", source)] - ListResponseBody { source: reqwest::Error }, + ListResponseBody { source: HttpError }, #[error("Error getting create multipart response body: {}", source)] - CreateMultipartResponseBody { source: reqwest::Error }, + CreateMultipartResponseBody { source: HttpError }, #[error("Error performing complete multipart request: {}: {}", path, source)] CompleteMultipartRequest { - source: crate::client::retry::Error, + source: crate::client::retry::RetryError, path: String, }, #[error("Error getting complete multipart response body: {}", source)] - CompleteMultipartResponseBody { source: reqwest::Error }, + CompleteMultipartResponseBody { source: HttpError }, #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, @@ -272,7 +275,7 @@ pub enum RequestError { #[error("Retry")] Retry { - source: crate::client::retry::Error, + source: crate::client::retry::RetryError, path: String, }, } @@ -290,7 +293,7 @@ impl From for crate::Error { pub(crate) struct Request<'a> { path: &'a Path, config: &'a S3Config, - builder: RequestBuilder, + builder: HttpRequestBuilder, payload_sha256: Option, payload: Option, use_session_creds: bool, @@ -307,8 +310,8 @@ impl Request<'_> { pub(crate) fn header(self, k: K, v: &str) -> Self where - HeaderName: TryFrom, - >::Error: Into, + K: TryInto, + K::Error: Into, { let builder = self.builder.header(k, v); Self { builder, ..self } @@ -408,7 +411,7 @@ impl Request<'_> { self } - pub(crate) async fn send(self) -> Result { + pub(crate) async fn send(self) -> Result { let credential = match self.use_session_creds { true => self.config.get_session_credential().await?, false => SessionCredential { @@ -446,13 +449,12 @@ impl Request<'_> { #[derive(Debug)] pub(crate) struct S3Client { pub config: S3Config, - pub client: ReqwestClient, + pub client: HttpClient, } impl S3Client { - pub(crate) fn new(config: S3Config) -> Result { - let client = config.client_options.client()?; - Ok(Self { config, client }) + pub(crate) fn new(config: S3Config, client: HttpClient) -> Self { + Self { config, client } } pub(crate) fn request<'a>(&'a self, method: Method, path: &'a Path) -> Request<'a> { @@ -544,6 +546,7 @@ impl S3Client { .send_retry(&self.config.retry_config) .await .map_err(|source| Error::DeleteObjectsRequest { source })? 
+ .into_body() .bytes() .await .map_err(|source| Error::DeleteObjectsResponse { source })?; @@ -641,6 +644,7 @@ impl S3Client { .idempotent(true) .send() .await? + .into_body() .bytes() .await .map_err(|source| Error::CreateMultipartResponseBody { source })?; @@ -683,17 +687,17 @@ impl S3Client { // If SSE-C is used, we must include the encryption headers in every upload request. request = request.with_encryption_headers(); } - let response = request.send().await?; - let checksum_sha256 = response - .headers() + let (parts, body) = request.send().await?.into_parts(); + let checksum_sha256 = parts + .headers .get(SHA256_CHECKSUM) .and_then(|v| v.to_str().ok()) .map(|v| v.to_string()); let e_tag = match is_copy { - false => get_etag(response.headers()).map_err(|source| Error::Metadata { source })?, + false => get_etag(&parts.headers).map_err(|source| Error::Metadata { source })?, true => { - let response = response + let response = body .bytes() .await .map_err(|source| Error::CreateMultipartResponseBody { source })?; @@ -756,7 +760,7 @@ impl S3Client { let request = self .client - .request(Method::POST, url) + .post(url) .query(&[("uploadId", upload_id)]) .body(body) .with_aws_sigv4(credential.authorizer(), None); @@ -781,6 +785,7 @@ impl S3Client { .map_err(|source| Error::Metadata { source })?; let data = response + .into_body() .bytes() .await .map_err(|source| Error::CompleteMultipartResponseBody { source })?; @@ -795,7 +800,7 @@ impl S3Client { } #[cfg(test)] - pub(crate) async fn get_object_tagging(&self, path: &Path) -> Result { + pub(crate) async fn get_object_tagging(&self, path: &Path) -> Result { let credential = self.config.get_session_credential().await?; let url = format!("{}?tagging", self.config.path_url(path)); let response = self @@ -821,7 +826,7 @@ impl GetClient for S3Client { }; /// Make an S3 GET request - async fn get_request(&self, path: &Path, options: GetOptions) -> Result { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let credential = self.config.get_session_credential().await?; let url = self.config.path_url(path); let method = match options.head { @@ -895,6 +900,7 @@ impl ListClient for Arc { .send_retry(&self.config.retry_config) .await .map_err(|source| Error::ListRequest { source })? + .into_body() .bytes() .await .map_err(|source| Error::ListResponseBody { source })?; diff --git a/src/aws/credential.rs b/src/aws/credential.rs index 9c74e1c..1b62842 100644 --- a/src/aws/credential.rs +++ b/src/aws/credential.rs @@ -16,18 +16,18 @@ // under the License. 
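The pattern above recurs throughout this change: responses are now plain `http::Response<HttpResponseBody>`, so callers split headers from body with `into_parts()` and collect the payload explicitly instead of relying on `reqwest::Response`. A condensed sketch of that flow, using the aliases this patch introduces:

    use bytes::Bytes;
    use http::header::ETAG;

    use crate::client::{HttpError, HttpResponse};

    // Split a response into headers and body, then collect the payload.
    async fn etag_and_bytes(response: HttpResponse) -> Result<(Option<String>, Bytes), HttpError> {
        let (parts, body) = response.into_parts();
        let e_tag = parts
            .headers
            .get(ETAG)
            .and_then(|v| v.to_str().ok())
            .map(str::to_string);
        let data = body.bytes().await?;
        Ok((e_tag, data))
    }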
use crate::aws::{AwsCredentialProvider, STORE, STRICT_ENCODE_SET, STRICT_PATH_ENCODE_SET}; +use crate::client::builder::HttpRequestBuilder; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; -use crate::client::TokenProvider; +use crate::client::{HttpClient, HttpError, HttpRequest, TokenProvider}; use crate::util::{hex_digest, hex_encode, hmac_sha256}; use crate::{CredentialProvider, Result, RetryConfig}; use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; -use hyper::header::HeaderName; +use http::header::{HeaderMap, HeaderName, HeaderValue, AUTHORIZATION}; +use http::{Method, StatusCode}; use percent_encoding::utf8_percent_encode; -use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION}; -use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; use std::collections::BTreeMap; use std::sync::Arc; @@ -39,10 +39,12 @@ use url::Url; #[allow(clippy::enum_variant_names)] enum Error { #[error("Error performing CreateSession request: {source}")] - CreateSessionRequest { source: crate::client::retry::Error }, + CreateSessionRequest { + source: crate::client::retry::RetryError, + }, #[error("Error getting CreateSession response: {source}")] - CreateSessionResponse { source: reqwest::Error }, + CreateSessionResponse { source: HttpError }, #[error("Invalid CreateSessionOutput response: {source}")] CreateSessionOutput { source: quick_xml::DeError }, @@ -89,7 +91,7 @@ impl AwsCredential { } } -/// Authorize a [`Request`] with an [`AwsCredential`] using [AWS SigV4] +/// Authorize a [`HttpRequest`] with an [`AwsCredential`] using [AWS SigV4] /// /// [AWS SigV4]: https://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html #[derive(Debug)] @@ -158,14 +160,16 @@ impl<'a> AwsAuthorizer<'a> { /// * Otherwise it is set to the hex encoded SHA256 of the request body /// /// [AWS SigV4]: https://docs.aws.amazon.com/IAM/latest/UserGuide/create-signed-request.html - pub fn authorize(&self, request: &mut Request, pre_calculated_digest: Option<&[u8]>) { + pub fn authorize(&self, request: &mut HttpRequest, pre_calculated_digest: Option<&[u8]>) { + let url = Url::parse(&request.uri().to_string()).unwrap(); + if let Some(ref token) = self.credential.token { let token_val = HeaderValue::from_str(token).unwrap(); let header = self.token_header.as_ref().unwrap_or(&TOKEN_HEADER); request.headers_mut().insert(header, token_val); } - let host = &request.url()[url::Position::BeforeHost..url::Position::AfterPort]; + let host = &url[url::Position::BeforeHost..url::Position::AfterPort]; let host_val = HeaderValue::from_str(host).unwrap(); request.headers_mut().insert("host", host_val); @@ -178,9 +182,9 @@ impl<'a> AwsAuthorizer<'a> { false => UNSIGNED_PAYLOAD.to_string(), true => match pre_calculated_digest { Some(digest) => hex_encode(digest), - None => match request.body() { - None => EMPTY_SHA256_HASH.to_string(), - Some(body) => match body.as_bytes() { + None => match request.body().is_empty() { + true => EMPTY_SHA256_HASH.to_string(), + false => match request.body().as_bytes() { Some(bytes) => hex_digest(bytes), None => STREAMING_PAYLOAD.to_string(), }, @@ -208,7 +212,7 @@ impl<'a> AwsAuthorizer<'a> { date, &scope, request.method(), - request.url(), + &url, &canonical_headers, &signed_headers, &digest, @@ -350,7 +354,7 @@ pub(crate) trait CredentialExt { ) -> Self; } -impl CredentialExt for RequestBuilder { +impl CredentialExt for HttpRequestBuilder { fn with_aws_sigv4( self, authorizer: Option>, @@ 
-358,7 +362,7 @@ impl CredentialExt for RequestBuilder { ) -> Self { match authorizer { Some(authorizer) => { - let (client, request) = self.build_split(); + let (client, request) = self.into_parts(); let mut request = request.expect("request valid"); authorizer.authorize(&mut request, payload_sha256); @@ -461,7 +465,7 @@ impl TokenProvider for InstanceCredentialProvider { async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> Result>> { instance_creds(client, retry, &self.metadata_endpoint, self.imdsv1_fallback) @@ -490,7 +494,7 @@ impl TokenProvider for WebIdentityProvider { async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> Result>> { web_identity( @@ -530,7 +534,7 @@ impl From for AwsCredential { /// async fn instance_creds( - client: &Client, + client: &HttpClient, retry_config: &RetryConfig, endpoint: &str, imdsv1_fallback: bool, @@ -549,7 +553,7 @@ async fn instance_creds( .await; let token = match token_result { - Ok(t) => Some(t.text().await?), + Ok(t) => Some(t.into_body().text().await?), Err(e) if imdsv1_fallback && matches!(e.status(), Some(StatusCode::FORBIDDEN)) => { warn!("received 403 from metadata endpoint, falling back to IMDSv1"); None @@ -564,7 +568,12 @@ async fn instance_creds( role_request = role_request.header(AWS_EC2_METADATA_TOKEN_HEADER, token); } - let role = role_request.send_retry(retry_config).await?.text().await?; + let role = role_request + .send_retry(retry_config) + .await? + .into_body() + .text() + .await?; let creds_url = format!("{endpoint}/{CREDENTIALS_PATH}/{role}"); let mut creds_request = client.request(Method::GET, creds_url); @@ -572,7 +581,12 @@ async fn instance_creds( creds_request = creds_request.header(AWS_EC2_METADATA_TOKEN_HEADER, token); } - let creds: InstanceCredentials = creds_request.send_retry(retry_config).await?.json().await?; + let creds: InstanceCredentials = creds_request + .send_retry(retry_config) + .await? + .into_body() + .json() + .await?; let now = Utc::now(); let ttl = (creds.expiration - now).to_std().unwrap_or_default(); @@ -615,7 +629,7 @@ impl From for AwsCredential { /// async fn web_identity( - client: &Client, + client: &HttpClient, retry_config: &RetryConfig, token_path: &str, role_arn: &str, @@ -626,7 +640,7 @@ async fn web_identity( .map_err(|e| format!("Failed to read token file '{token_path}': {e}"))?; let bytes = client - .request(Method::POST, endpoint) + .post(endpoint) .query(&[ ("Action", "AssumeRoleWithWebIdentity"), ("DurationSeconds", "3600"), @@ -640,6 +654,7 @@ async fn web_identity( .sensitive(true) .send() .await? + .into_body() .bytes() .await?; @@ -663,7 +678,7 @@ async fn web_identity( pub(crate) struct TaskCredentialProvider { pub url: String, pub retry: RetryConfig, - pub client: Client, + pub client: HttpClient, pub cache: TokenCache>, } @@ -684,11 +699,17 @@ impl CredentialProvider for TaskCredentialProvider { /// async fn task_credential( - client: &Client, + client: &HttpClient, retry: &RetryConfig, url: &str, ) -> Result>, StdError> { - let creds: InstanceCredentials = client.get(url).send_retry(retry).await?.json().await?; + let creds: InstanceCredentials = client + .get(url) + .send_retry(retry) + .await? 
+ .into_body() + .json() + .await?; let now = Utc::now(); let ttl = (creds.expiration - now).to_std().unwrap_or_default(); @@ -714,7 +735,7 @@ impl TokenProvider for SessionProvider { async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> Result>> { let creds = self.credentials.get_credential().await?; @@ -726,6 +747,7 @@ impl TokenProvider for SessionProvider { .send_retry(retry) .await .map_err(|source| Error::CreateSessionRequest { source })? + .into_body() .bytes() .await .map_err(|source| Error::CreateSessionResponse { source })?; @@ -752,14 +774,15 @@ struct CreateSessionOutput { mod tests { use super::*; use crate::client::mock_server::MockServer; - use hyper::Response; + use crate::client::HttpClient; + use http::Response; use reqwest::{Client, Method}; use std::env; // Test generated using https://docs.aws.amazon.com/general/latest/gr/sigv4-signed-request-examples.html #[test] fn test_sign_with_signed_payload() { - let client = Client::new(); + let client = HttpClient::new(Client::new()); // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html let credential = AwsCredential { @@ -780,7 +803,8 @@ mod tests { let mut request = client .request(Method::GET, "https://ec2.amazon.com/") - .build() + .into_parts() + .1 .unwrap(); let signer = AwsAuthorizer { @@ -799,7 +823,7 @@ mod tests { #[test] fn test_sign_with_signed_payload_request_payer() { - let client = Client::new(); + let client = HttpClient::new(Client::new()); // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html let credential = AwsCredential { @@ -820,7 +844,8 @@ mod tests { let mut request = client .request(Method::GET, "https://ec2.amazon.com/") - .build() + .into_parts() + .1 .unwrap(); let signer = AwsAuthorizer { @@ -839,7 +864,7 @@ mod tests { #[test] fn test_sign_with_unsigned_payload() { - let client = Client::new(); + let client = HttpClient::new(Client::new()); // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html let credential = AwsCredential { @@ -860,7 +885,8 @@ mod tests { let mut request = client .request(Method::GET, "https://ec2.amazon.com/") - .build() + .into_parts() + .1 .unwrap(); let authorizer = AwsAuthorizer { @@ -962,7 +988,7 @@ mod tests { #[test] fn test_sign_port() { - let client = Client::new(); + let client = HttpClient::new(Client::new()); let credential = AwsCredential { key_id: "H20ABqCkLZID4rLe".to_string(), @@ -982,7 +1008,8 @@ mod tests { ("list-type", "2"), ("prefix", ""), ]) - .build() + .into_parts() + .1 .unwrap(); let authorizer = AwsAuthorizer { @@ -1008,15 +1035,15 @@ mod tests { // For example https://github.com/aws/amazon-ec2-metadata-mock let endpoint = env::var("EC2_METADATA_ENDPOINT").unwrap(); - let client = Client::new(); + let client = HttpClient::new(Client::new()); let retry_config = RetryConfig::default(); // Verify only allows IMDSv2 - let resp = client + let (client, req) = client .request(Method::GET, format!("{endpoint}/latest/meta-data/ami-id")) - .send() - .await - .unwrap(); + .into_parts(); + + let resp = client.execute(req.unwrap()).await.unwrap(); assert_eq!( resp.status(), @@ -1048,7 +1075,7 @@ mod tests { let token = "TOKEN"; let endpoint = server.url(); - let client = Client::new(); + let client = HttpClient::new(Client::new()); let retry_config = RetryConfig::default(); // Test IMDSv2 diff --git a/src/aws/dynamo.rs b/src/aws/dynamo.rs index 6283e76..73380aa 100644 --- 
a/src/aws/dynamo.rs +++ b/src/aws/dynamo.rs @@ -23,7 +23,7 @@ use std::future::Future; use std::time::{Duration, Instant}; use chrono::Utc; -use reqwest::{Response, StatusCode}; +use http::{Method, StatusCode}; use serde::ser::SerializeMap; use serde::{Deserialize, Serialize, Serializer}; @@ -31,8 +31,8 @@ use crate::aws::client::S3Client; use crate::aws::credential::CredentialExt; use crate::aws::{AwsAuthorizer, AwsCredential}; use crate::client::get::GetClientExt; -use crate::client::retry::Error as RetryError; use crate::client::retry::RetryExt; +use crate::client::retry::{RequestError, RetryError}; use crate::path::Path; use crate::{Error, GetOptions, Result}; @@ -317,20 +317,20 @@ impl DynamoCommit { cred: Option<&AwsCredential>, target: &str, req: R, - ) -> Result { + ) -> Result { let region = &s3.config.region; let authorizer = cred.map(|x| AwsAuthorizer::new(x, "dynamodb", region)); let builder = match &s3.config.endpoint { - Some(e) => s3.client.post(e), + Some(e) => s3.client.request(Method::POST, e), None => { let url = format!("https://dynamodb.{region}.amazonaws.com"); - s3.client.post(url) + s3.client.request(Method::POST, url) } }; + // TODO: Timeout builder - .timeout(Duration::from_millis(self.timeout)) .json(&req) .header("X-Amz-Target", target) .with_aws_sigv4(authorizer, None) @@ -383,8 +383,8 @@ async fn check_precondition(client: &S3Client, path: &Path, etag: Option<&str>) /// Parses the error response if any fn parse_error_response(e: &RetryError) -> Option> { - match e { - RetryError::Client { + match e.inner() { + RequestError::Status { status: StatusCode::BAD_REQUEST, body: Some(b), } => serde_json::from_str(b).ok(), @@ -518,6 +518,7 @@ mod number { } } +use crate::client::HttpResponse; /// Re-export integration_test to be called by s3_test #[cfg(test)] pub(crate) use tests::integration_test; diff --git a/src/aws/mod.rs b/src/aws/mod.rs index b065927..0625ae1 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -140,7 +140,7 @@ impl Signer for AmazonS3 { .with_request_payer(self.client.config.request_payer); let path_url = self.path_url(path); - let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { + let mut url = path_url.parse().map_err(|e| Error::Generic { store: STORE, source: format!("Unable to parse url {path_url}: {e}").into(), })?; @@ -489,7 +489,7 @@ mod tests { use crate::ClientOptions; use base64::prelude::BASE64_STANDARD; use base64::Engine; - use hyper::HeaderMap; + use http::HeaderMap; const NON_EXISTENT_NAME: &str = "nonexistentname"; diff --git a/src/azure/builder.rs b/src/azure/builder.rs index f0572eb..ab0a484 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -21,7 +21,7 @@ use crate::azure::credential::{ ImdsManagedIdentityProvider, WorkloadIdentityOAuthProvider, }; use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; -use crate::client::TokenCredentialProvider; +use crate::client::{HttpConnector, ReqwestConnector, TokenCredentialProvider}; use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; @@ -178,6 +178,8 @@ pub struct MicrosoftAzureBuilder { fabric_session_token: Option, /// Fabric cluster identifier fabric_cluster_identifier: Option, + /// The [`HttpConnector`] to use + http_connector: Option>, } /// Configuration keys for [`MicrosoftAzureBuilder`] @@ -887,6 +889,12 @@ impl MicrosoftAzureBuilder { self } + /// Overrides the [`HttpConnector`], by default uses 
[`ReqwestConnector`] + pub fn with_http_connector(mut self, connector: C) -> Self { + self.http_connector = Some(Arc::new(connector)); + self + } + /// Configure a connection to container with given name on Microsoft Azure Blob store. pub fn build(mut self) -> Result { if let Some(url) = self.url.take() { @@ -899,6 +907,10 @@ impl MicrosoftAzureBuilder { Arc::new(StaticCredentialProvider::new(credential)) }; + let http = self + .http_connector + .unwrap_or_else(|| Arc::new(ReqwestConnector::default())); + let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? { let account_name = self .account_name @@ -960,7 +972,7 @@ impl MicrosoftAzureBuilder { ); Arc::new(TokenCredentialProvider::new( fabric_credential, - self.client_options.client()?, + http.connect(&self.client_options)?, self.retry_config.clone(), )) as _ } else if let Some(bearer_token) = self.bearer_token { @@ -979,7 +991,7 @@ impl MicrosoftAzureBuilder { ); Arc::new(TokenCredentialProvider::new( client_credential, - self.client_options.client()?, + http.connect(&self.client_options)?, self.retry_config.clone(), )) as _ } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = @@ -993,7 +1005,7 @@ impl MicrosoftAzureBuilder { ); Arc::new(TokenCredentialProvider::new( client_credential, - self.client_options.client()?, + http.connect(&self.client_options)?, self.retry_config.clone(), )) as _ } else if let Some(query_pairs) = self.sas_query_pairs { @@ -1011,7 +1023,7 @@ impl MicrosoftAzureBuilder { ); Arc::new(TokenCredentialProvider::new( msi_credential, - self.client_options.metadata_client()?, + http.connect(&self.client_options.metadata_options())?, self.retry_config.clone(), )) as _ }; @@ -1030,7 +1042,8 @@ impl MicrosoftAzureBuilder { credentials: auth, }; - let client = Arc::new(AzureClient::new(config)?); + let http_client = http.connect(&config.client_options)?; + let client = Arc::new(AzureClient::new(config, http_client)); Ok(MicrosoftAzure { client }) } diff --git a/src/azure/client.rs b/src/azure/client.rs index 2c2e27e..13e40bb 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -18,11 +18,12 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; +use crate::client::builder::HttpRequestBuilder; use crate::client::get::GetClient; use crate::client::header::{get_put_result, HeaderConfig}; use crate::client::list::ListClient; use crate::client::retry::RetryExt; -use crate::client::GetOptionsExt; +use crate::client::{GetOptionsExt, HttpClient, HttpError, HttpRequest, HttpResponse}; use crate::multipart::PartId; use crate::path::DELIMITER; use crate::util::{deserialize_rfc1123, GetRange}; @@ -35,12 +36,11 @@ use base64::prelude::{BASE64_STANDARD, BASE64_STANDARD_NO_PAD}; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; -use hyper::http::HeaderName; -use rand::Rng as _; -use reqwest::{ +use http::{ header::{HeaderMap, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE, IF_MATCH, IF_NONE_MATCH}, - Client as ReqwestClient, Method, RequestBuilder, Response, + HeaderName, Method, }; +use rand::Rng as _; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::Arc; @@ -63,27 +63,29 @@ static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); pub(crate) enum Error { #[error("Error performing get request {}: {}", path, source)] GetRequest { - source: crate::client::retry::Error, + source: crate::client::retry::RetryError, path: String, }, #[error("Error 
performing put request {}: {}", path, source)] PutRequest { - source: crate::client::retry::Error, + source: crate::client::retry::RetryError, path: String, }, #[error("Error performing delete request {}: {}", path, source)] DeleteRequest { - source: crate::client::retry::Error, + source: crate::client::retry::RetryError, path: String, }, #[error("Error performing bulk delete request: {}", source)] - BulkDeleteRequest { source: crate::client::retry::Error }, + BulkDeleteRequest { + source: crate::client::retry::RetryError, + }, #[error("Error receiving bulk delete request body: {}", source)] - BulkDeleteRequestBody { source: reqwest::Error }, + BulkDeleteRequestBody { source: HttpError }, #[error( "Bulk delete request failed due to invalid input: {} (code: {})", @@ -108,10 +110,12 @@ pub(crate) enum Error { }, #[error("Error performing list request: {}", source)] - ListRequest { source: crate::client::retry::Error }, + ListRequest { + source: crate::client::retry::RetryError, + }, #[error("Error getting list response body: {}", source)] - ListResponseBody { source: reqwest::Error }, + ListResponseBody { source: HttpError }, #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, @@ -125,10 +129,12 @@ pub(crate) enum Error { MissingETag, #[error("Error requesting user delegation key: {}", source)] - DelegationKeyRequest { source: crate::client::retry::Error }, + DelegationKeyRequest { + source: crate::client::retry::RetryError, + }, #[error("Error getting user delegation key response body: {}", source)] - DelegationKeyResponseBody { source: reqwest::Error }, + DelegationKeyResponseBody { source: HttpError }, #[error("Got invalid user delegation key response: {}", source)] DelegationKeyResponse { source: quick_xml::de::DeError }, @@ -194,7 +200,7 @@ struct PutRequest<'a> { path: &'a Path, config: &'a AzureConfig, payload: PutPayload, - builder: RequestBuilder, + builder: HttpRequestBuilder, idempotent: bool, } @@ -251,7 +257,7 @@ impl PutRequest<'_> { Self { builder, ..self } } - async fn send(self) -> Result { + async fn send(self) -> Result { let credential = self.config.get_credential().await?; let sensitive = credential .as_deref() @@ -317,7 +323,7 @@ fn serialize_part_delete_request( dst: &mut Vec, boundary: &str, idx: usize, - request: reqwest::Request, + request: HttpRequest, relative_url: String, ) { // Encode start marker for part @@ -349,7 +355,7 @@ fn serialize_part_delete_request( extend(dst, b"\r\n"); } -fn parse_multipart_response_boundary(response: &Response) -> Result { +fn parse_multipart_response_boundary(response: &HttpResponse) -> Result { let invalid_response = |msg: &str| Error::InvalidBulkDeleteResponse { reason: msg.to_string(), }; @@ -496,14 +502,13 @@ async fn parse_blob_batch_delete_body( #[derive(Debug)] pub(crate) struct AzureClient { config: AzureConfig, - client: ReqwestClient, + client: HttpClient, } impl AzureClient { /// create a new instance of [AzureClient] - pub(crate) fn new(config: AzureConfig) -> Result { - let client = config.client_options.client()?; - Ok(Self { config, client }) + pub(crate) fn new(config: AzureConfig, client: HttpClient) -> Self { + Self { config, client } } /// Returns the config @@ -517,7 +522,7 @@ impl AzureClient { fn put_request<'a>(&'a self, path: &'a Path, payload: PutPayload) -> PutRequest<'a> { let url = self.config.path_url(path); - let builder = self.client.request(Method::PUT, url); + let builder = self.client.request(Method::PUT, url.as_str()); PutRequest { path, @@ 
-614,7 +619,7 @@ impl AzureClient { .map(|c| c.sensitive_request()) .unwrap_or_default(); self.client - .request(Method::DELETE, url) + .delete(url.as_str()) .query(query) .header(&DELETE_SNAPSHOTS, "include") .with_azure_authorization(&credential, &self.config.account) @@ -644,17 +649,20 @@ impl AzureClient { // Build subrequest with proper authorization let request = self .client - .request(Method::DELETE, url) + .delete(url.as_str()) .header(CONTENT_LENGTH, HeaderValue::from(0)) // Each subrequest must be authorized individually [1] and we use // the CredentialExt for this. // [1]: https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch?tabs=microsoft-entra-id#request-body .with_azure_authorization(credential, &self.config.account) - .build() + .into_parts() + .1 .unwrap(); + let url: Url = request.uri().to_string().parse().unwrap(); + // Url for part requests must be relative and without base - let relative_url = self.config.service.make_relative(request.url()).unwrap(); + let relative_url = self.config.service.make_relative(&url).unwrap(); serialize_part_delete_request(&mut body_bytes, boundary, idx, request, relative_url) } @@ -684,7 +692,7 @@ impl AzureClient { let url = self.config.path_url(&Path::from("/")); let batch_response = self .client - .request(Method::POST, url) + .post(url.as_str()) .query(&[("restype", "container"), ("comp", "batch")]) .header( CONTENT_TYPE, @@ -701,6 +709,7 @@ impl AzureClient { let boundary = parse_multipart_response_boundary(&batch_response)?; let batch_body = batch_response + .into_body() .bytes() .await .map_err(|source| Error::BulkDeleteRequestBody { source })?; @@ -724,7 +733,7 @@ impl AzureClient { let mut builder = self .client - .request(Method::PUT, url) + .request(Method::PUT, url.as_str()) .header(©_SOURCE, source.to_string()) .header(CONTENT_LENGTH, HeaderValue::from_static("0")); @@ -772,9 +781,10 @@ impl AzureClient { .as_deref() .map(|c| c.sensitive_request()) .unwrap_or_default(); + let response = self .client - .request(Method::POST, url) + .post(url.as_str()) .body(body) .query(&[("restype", "service"), ("comp", "userdelegationkey")]) .with_azure_authorization(&credential, &self.config.account) @@ -784,6 +794,7 @@ impl AzureClient { .send() .await .map_err(|source| Error::DelegationKeyRequest { source })? 
+ .into_body() .bytes() .await .map_err(|source| Error::DelegationKeyResponseBody { source })?; @@ -829,7 +840,7 @@ impl AzureClient { } #[cfg(test)] - pub(crate) async fn get_blob_tagging(&self, path: &Path) -> Result { + pub(crate) async fn get_blob_tagging(&self, path: &Path) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); let sensitive = credential @@ -838,7 +849,7 @@ impl AzureClient { .unwrap_or_default(); let response = self .client - .request(Method::GET, url) + .get(url.as_str()) .query(&[("comp", "tags")]) .with_azure_authorization(&credential, &self.config.account) .retryable(&self.config.retry_config) @@ -868,7 +879,7 @@ impl GetClient for AzureClient { /// Make an Azure GET request /// /// - async fn get_request(&self, path: &Path, options: GetOptions) -> Result { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { // As of 2024-01-02, Azure does not support suffix requests, // so we should fail fast here rather than sending one if let Some(GetRange::Suffix(_)) = options.range.as_ref() { @@ -886,7 +897,7 @@ impl GetClient for AzureClient { let mut builder = self .client - .request(method, url) + .request(method, url.as_str()) .header(CONTENT_LENGTH, HeaderValue::from_static("0")) .body(Bytes::new()); @@ -961,7 +972,7 @@ impl ListClient for Arc { .unwrap_or_default(); let response = self .client - .request(Method::GET, url) + .get(url.as_str()) .query(&query) .with_azure_authorization(&credential, &self.config.account) .retryable(&self.config.retry_config) @@ -969,6 +980,7 @@ impl ListClient for Arc { .send() .await .map_err(|source| Error::ListRequest { source })? + .into_body() .bytes() .await .map_err(|source| Error::ListResponseBody { source })?; @@ -1147,11 +1159,11 @@ pub(crate) struct UserDelegationKey { #[cfg(test)] mod tests { - use bytes::Bytes; - use regex::bytes::Regex; - use super::*; use crate::StaticCredentialProvider; + use bytes::Bytes; + use regex::bytes::Regex; + use reqwest::Client; #[test] fn deserde_azure() { @@ -1360,7 +1372,7 @@ mod tests { client_options: Default::default(), }; - let client = AzureClient::new(config).unwrap(); + let client = AzureClient::new(config, HttpClient::new(Client::new())); let credential = client.get_credential().await.unwrap(); let paths = &[Path::from("a"), Path::from("b"), Path::from("c")]; @@ -1454,7 +1466,7 @@ RequestId:778fdc83-801e-0000-62ff-0334671e2852 Time:2018-06-14T16:46:54.6040685Z\r --batchresponse_66925647-d0cb-4109-b6d3-28efe3e1e5ed--\r\n"; - let response: reqwest::Response = http::Response::builder() + let response: HttpResponse = http::Response::builder() .status(202) .header("Transfer-Encoding", "chunked") .header( @@ -1463,12 +1475,11 @@ Time:2018-06-14T16:46:54.6040685Z\r ) .header("x-ms-request-id", "778fdc83-801e-0000-62ff-033467000000") .header("x-ms-version", "2018-11-09") - .body(Bytes::from(response_body.as_slice())) - .unwrap() - .into(); + .body(Bytes::from(response_body.as_slice()).into()) + .unwrap(); let boundary = parse_multipart_response_boundary(&response).unwrap(); - let body = response.bytes().await.unwrap(); + let body = response.into_body().bytes().await.unwrap(); let paths = &[Path::from("a"), Path::from("b"), Path::from("c")]; diff --git a/src/azure/credential.rs b/src/azure/credential.rs index c9e6ac6..27f8776 100644 --- a/src/azure/credential.rs +++ b/src/azure/credential.rs @@ -15,22 +15,24 @@ // specific language governing permissions and limitations // under the License. 
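As the updated multipart test above shows, dropping `reqwest::Response` means a response can be fabricated directly with `http::Response::builder()`, because `HttpResponseBody` implements `From<Bytes>`. A minimal sketch of that test pattern:

    use bytes::Bytes;

    use crate::client::HttpResponse;

    // Build a synthetic response for a unit test; the Bytes body converts into
    // HttpResponseBody via the From impl added by this patch.
    fn fake_response(body: &'static [u8]) -> HttpResponse {
        http::Response::builder()
            .status(202)
            .header("x-ms-version", "2018-11-09")
            .body(Bytes::from_static(body).into())
            .unwrap()
    }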
+use super::client::UserDelegationKey; use crate::azure::STORE; +use crate::client::builder::{add_query_pairs, HttpRequestBuilder}; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; -use crate::client::{CredentialProvider, TokenProvider}; +use crate::client::{CredentialProvider, HttpClient, HttpError, HttpRequest, TokenProvider}; use crate::util::hmac_sha256; use crate::RetryConfig; use async_trait::async_trait; use base64::prelude::{BASE64_STANDARD, BASE64_URL_SAFE_NO_PAD}; use base64::Engine; use chrono::{DateTime, SecondsFormat, Utc}; -use reqwest::header::{ +use http::header::{ HeaderMap, HeaderName, HeaderValue, ACCEPT, AUTHORIZATION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, IF_MODIFIED_SINCE, IF_NONE_MATCH, IF_UNMODIFIED_SINCE, RANGE, }; -use reqwest::{Client, Method, Request, RequestBuilder}; +use http::Method; use serde::Deserialize; use std::borrow::Cow; use std::collections::HashMap; @@ -42,8 +44,6 @@ use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; use url::Url; -use super::client::UserDelegationKey; - static AZURE_VERSION: HeaderValue = HeaderValue::from_static("2023-11-03"); static VERSION: HeaderName = HeaderName::from_static("x-ms-version"); pub(crate) static BLOB_TYPE: HeaderName = HeaderName::from_static("x-ms-blob-type"); @@ -73,10 +73,12 @@ const AZURE_STORAGE_RESOURCE: &str = "https://storage.azure.com"; #[derive(Debug, thiserror::Error)] pub enum Error { #[error("Error performing token request: {}", source)] - TokenRequest { source: crate::client::retry::Error }, + TokenRequest { + source: crate::client::retry::RetryError, + }, #[error("Error getting token response body: {}", source)] - TokenResponseBody { source: reqwest::Error }, + TokenResponseBody { source: HttpError }, #[error("Error reading federated token file ")] FederatedTokenFile, @@ -206,7 +208,7 @@ impl AzureSigner { } } -fn add_date_and_version_headers(request: &mut Request) { +fn add_date_and_version_headers(request: &mut HttpRequest) { // rfc2822 string should never contain illegal characters let date = Utc::now(); let date_str = date.format(RFC1123_FMT).to_string(); @@ -218,7 +220,7 @@ fn add_date_and_version_headers(request: &mut Request) { .insert(&VERSION, AZURE_VERSION.clone()); } -/// Authorize a [`Request`] with an [`AzureAuthorizer`] +/// Authorize a [`HttpRequest`] with an [`AzureAuthorizer`] #[derive(Debug)] pub struct AzureAuthorizer<'a> { credential: &'a AzureCredential, @@ -235,14 +237,15 @@ impl<'a> AzureAuthorizer<'a> { } /// Authorize `request` - pub fn authorize(&self, request: &mut Request) { + pub fn authorize(&self, request: &mut HttpRequest) { add_date_and_version_headers(request); match self.credential { AzureCredential::AccessKey(key) => { + let url = Url::parse(&request.uri().to_string()).unwrap(); let signature = generate_authorization( request.headers(), - request.url(), + &url, request.method(), self.account, key, @@ -262,10 +265,7 @@ impl<'a> AzureAuthorizer<'a> { ); } AzureCredential::SASToken(query_pairs) => { - request - .url_mut() - .query_pairs_mut() - .extend_pairs(query_pairs); + add_query_pairs(request.uri_mut(), query_pairs); } } } @@ -281,13 +281,13 @@ pub(crate) trait CredentialExt { ) -> Self; } -impl CredentialExt for RequestBuilder { +impl CredentialExt for HttpRequestBuilder { fn with_azure_authorization( self, credential: &Option>, account: &str, ) -> Self { - let (client, request) = self.build_split(); + let (client, request) = self.into_parts(); let mut 
request = request.expect("request valid"); match credential.as_deref() { @@ -622,13 +622,13 @@ impl TokenProvider for ClientSecretOAuthProvider { /// Fetch a token async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> crate::Result>> { let response: OAuthTokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) - .form(&[ + .form([ ("client_id", self.client_id.as_str()), ("client_secret", self.client_secret.as_str()), ("scope", AZURE_STORAGE_SCOPE), @@ -639,6 +639,7 @@ impl TokenProvider for ClientSecretOAuthProvider { .send() .await .map_err(|source| Error::TokenRequest { source })? + .into_body() .json() .await .map_err(|source| Error::TokenResponseBody { source })?; @@ -712,7 +713,7 @@ impl TokenProvider for ImdsManagedIdentityProvider { /// Fetch a token async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> crate::Result>> { let mut query_items = vec![ @@ -747,6 +748,7 @@ impl TokenProvider for ImdsManagedIdentityProvider { .send_retry(retry) .await .map_err(|source| Error::TokenRequest { source })? + .into_body() .json() .await .map_err(|source| Error::TokenResponseBody { source })?; @@ -798,7 +800,7 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { /// Fetch a token async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> crate::Result>> { let token_str = std::fs::read_to_string(&self.federated_token_file) @@ -808,7 +810,7 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { let response: OAuthTokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) - .form(&[ + .form([ ("client_id", self.client_id.as_str()), ( "client_assertion_type", @@ -823,6 +825,7 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { .send() .await .map_err(|source| Error::TokenRequest { source })? + .into_body() .json() .await .map_err(|source| Error::TokenResponseBody { source })?; @@ -1009,7 +1012,7 @@ impl TokenProvider for FabricTokenOAuthProvider { /// Fetch a token async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> crate::Result>> { if let Some(storage_access_token) = &self.storage_access_token { @@ -1037,6 +1040,7 @@ impl TokenProvider for FabricTokenOAuthProvider { .send() .await .map_err(|source| Error::TokenRequest { source })? 
+ .into_body() .text() .await .map_err(|source| Error::TokenResponseBody { source })?; @@ -1061,8 +1065,8 @@ impl CredentialProvider for AzureCliCredential { #[cfg(test)] mod tests { use futures::executor::block_on; + use http::{Response, StatusCode}; use http_body_util::BodyExt; - use hyper::{Response, StatusCode}; use reqwest::{Client, Method}; use tempfile::NamedTempFile; @@ -1078,7 +1082,7 @@ mod tests { std::env::set_var(MSI_SECRET_ENV_KEY, "env-secret"); let endpoint = server.url(); - let client = Client::new(); + let client = HttpClient::new(Client::new()); let retry_config = RetryConfig::default(); // Test IMDS @@ -1137,7 +1141,7 @@ mod tests { std::fs::write(tokenfile.path(), "federated-token").unwrap(); let endpoint = server.url(); - let client = Client::new(); + let client = HttpClient::new(Client::new()); let retry_config = RetryConfig::default(); // Test IMDS diff --git a/src/azure/mod.rs b/src/azure/mod.rs index bbecba5..b4243dd 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -58,7 +58,7 @@ const STORE: &str = "MicrosoftAzure"; /// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). #[derive(Debug)] pub struct MicrosoftAzure { - client: Arc, + client: Arc, } impl MicrosoftAzure { @@ -68,7 +68,7 @@ impl MicrosoftAzure { } /// Create a full URL to the resource specified by `path` with this instance's configuration. - fn path_url(&self, path: &Path) -> url::Url { + fn path_url(&self, path: &Path) -> Url { self.client.config().path_url(path) } } diff --git a/src/client/body.rs b/src/client/body.rs new file mode 100644 index 0000000..549b3e4 --- /dev/null +++ b/src/client/body.rs @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
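The OAuth providers above all follow the same shape once rewritten against `HttpClient`: build the request, send it with the retry extension, then drain the body with `into_body().json()`. A hedged sketch of that client-credentials exchange using crate-internal imports from this patch; `TokenResponse` and the scope string are stand-ins, not the crate's actual definitions:

    use http::Method;

    use crate::client::retry::RetryExt;
    use crate::client::HttpClient;
    use crate::RetryConfig;

    #[derive(serde::Deserialize)]
    struct TokenResponse {
        access_token: String,
        expires_in: u64,
    }

    async fn fetch_token(
        client: &HttpClient,
        retry: &RetryConfig,
        token_url: &str,
        client_id: &str,
        client_secret: &str,
    ) -> Result<TokenResponse, Box<dyn std::error::Error + Send + Sync>> {
        let token: TokenResponse = client
            .request(Method::POST, token_url)
            .form([
                ("client_id", client_id),
                ("client_secret", client_secret),
                ("scope", "https://storage.azure.com/.default"), // assumed scope
                ("grant_type", "client_credentials"),
            ])
            .send_retry(retry)
            .await?
            .into_body()
            .json()
            .await?;
        Ok(token)
    }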
+ +use crate::client::connection::{HttpError, HttpErrorKind}; +use crate::{collect_bytes, PutPayload}; +use bytes::Bytes; +use futures::stream::BoxStream; +use futures::StreamExt; +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; +use hyper::body::{Body, Frame, SizeHint}; +use std::pin::Pin; +use std::task::{Context, Poll}; + +/// An HTTP Request +pub type HttpRequest = http::Request; + +/// The [`Body`] of an [`HttpRequest`] +#[derive(Debug, Clone)] +pub struct HttpRequestBody(Inner); + +impl HttpRequestBody { + /// An empty [`HttpRequestBody`] + pub fn empty() -> Self { + Self(Inner::Bytes(Bytes::new())) + } + + pub(crate) fn into_reqwest(self) -> reqwest::Body { + match self.0 { + Inner::Bytes(b) => b.into(), + Inner::PutPayload(_, payload) => reqwest::Body::wrap_stream(futures::stream::iter( + payload.into_iter().map(Ok::<_, HttpError>), + )), + } + } + + /// Returns true if this body is empty + pub fn is_empty(&self) -> bool { + match &self.0 { + Inner::Bytes(x) => x.is_empty(), + Inner::PutPayload(_, x) => x.iter().any(|x| !x.is_empty()), + } + } + + /// Returns the total length of the [`Bytes`] in this body + pub fn content_length(&self) -> usize { + match &self.0 { + Inner::Bytes(x) => x.len(), + Inner::PutPayload(_, x) => x.content_length(), + } + } + + /// If this body consists of a single contiguous [`Bytes`], returns it + pub fn as_bytes(&self) -> Option<&Bytes> { + match &self.0 { + Inner::Bytes(x) => Some(x), + _ => None, + } + } +} + +impl From for HttpRequestBody { + fn from(value: Bytes) -> Self { + Self(Inner::Bytes(value)) + } +} + +impl From> for HttpRequestBody { + fn from(value: Vec) -> Self { + Self(Inner::Bytes(value.into())) + } +} + +impl From for HttpRequestBody { + fn from(value: String) -> Self { + Self(Inner::Bytes(value.into())) + } +} + +impl From for HttpRequestBody { + fn from(value: PutPayload) -> Self { + Self(Inner::PutPayload(0, value)) + } +} + +#[derive(Debug, Clone)] +enum Inner { + Bytes(Bytes), + PutPayload(usize, PutPayload), +} + +impl Body for HttpRequestBody { + type Data = Bytes; + type Error = HttpError; + + fn poll_frame( + mut self: Pin<&mut Self>, + _cx: &mut Context<'_>, + ) -> Poll, Self::Error>>> { + Poll::Ready(match &mut self.0 { + Inner::Bytes(bytes) => { + let out = bytes.split_off(0); + if out.is_empty() { + None + } else { + Some(Ok(Frame::data(out))) + } + } + Inner::PutPayload(offset, payload) => { + let slice = payload.as_ref(); + if *offset == slice.len() { + None + } else { + Some(Ok(Frame::data( + slice[std::mem::replace(offset, *offset + 1)].clone(), + ))) + } + } + }) + } + + fn is_end_stream(&self) -> bool { + match self.0 { + Inner::Bytes(ref bytes) => bytes.is_empty(), + Inner::PutPayload(offset, ref body) => offset == body.as_ref().len(), + } + } + + fn size_hint(&self) -> SizeHint { + match self.0 { + Inner::Bytes(ref bytes) => SizeHint::with_exact(bytes.len() as u64), + Inner::PutPayload(offset, ref payload) => { + let iter = payload.as_ref().iter().skip(offset); + SizeHint::with_exact(iter.map(|x| x.len() as u64).sum()) + } + } + } +} + +/// An HTTP response +pub type HttpResponse = http::Response; + +/// The body of an [`HttpResponse`] +#[derive(Debug)] +pub struct HttpResponseBody(BoxBody); + +impl HttpResponseBody { + /// Create an [`HttpResponseBody`] from the provided [`Body`] + /// + /// Note: [`BodyExt::map_err`] can be used to alter error variants + pub fn new(body: B) -> Self + where + B: Body + Send + Sync + 'static, + { + Self(BoxBody::new(body)) + } + + /// Collects this 
response into a [`Bytes`] + pub async fn bytes(self) -> Result { + let size_hint = self.0.size_hint().lower(); + let s = self.0.into_data_stream(); + collect_bytes(s, Some(size_hint)).await + } + + /// Returns a stream of this response data + pub fn bytes_stream(self) -> BoxStream<'static, Result> { + self.0.into_data_stream().boxed() + } + + /// Returns the response as a [`String`] + pub(crate) async fn text(self) -> Result { + let b = self.bytes().await?; + String::from_utf8(b.into()).map_err(|e| HttpError::new(HttpErrorKind::Decode, e)) + } + + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] + pub(crate) async fn json(self) -> Result { + let b = self.bytes().await?; + serde_json::from_slice(&b).map_err(|e| HttpError::new(HttpErrorKind::Decode, e)) + } +} + +impl From for HttpResponseBody { + fn from(value: Bytes) -> Self { + Self::new(Full::new(value).map_err(|e| match e {})) + } +} + +impl From> for HttpResponseBody { + fn from(value: Vec) -> Self { + Bytes::from(value).into() + } +} + +impl From for HttpResponseBody { + fn from(value: String) -> Self { + Bytes::from(value).into() + } +} diff --git a/src/client/builder.rs b/src/client/builder.rs new file mode 100644 index 0000000..0fbc12f --- /dev/null +++ b/src/client/builder.rs @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::connection::HttpErrorKind; +use crate::client::{HttpClient, HttpError, HttpRequest, HttpRequestBody}; +use http::header::{InvalidHeaderName, InvalidHeaderValue}; +use http::uri::InvalidUri; +use http::{HeaderName, HeaderValue, Method, Uri}; + +#[derive(Debug, thiserror::Error)] +pub(crate) enum RequestBuilderError { + #[error("Invalid URI")] + InvalidUri(#[from] InvalidUri), + + #[error("Invalid Header Value")] + InvalidHeaderValue(#[from] InvalidHeaderValue), + + #[error("Invalid Header Name")] + InvalidHeaderName(#[from] InvalidHeaderName), + + #[error("JSON serialization error")] + SerdeJson(#[from] serde_json::Error), + + #[error("URL serialization error")] + SerdeUrl(#[from] serde_urlencoded::ser::Error), +} + +impl From for HttpError { + fn from(value: RequestBuilderError) -> Self { + Self::new(HttpErrorKind::Request, value) + } +} + +impl From for RequestBuilderError { + fn from(value: std::convert::Infallible) -> Self { + match value {} + } +} + +pub(crate) struct HttpRequestBuilder { + client: HttpClient, + request: Result, +} + +impl HttpRequestBuilder { + pub(crate) fn new(client: HttpClient) -> Self { + Self { + client, + request: Ok(HttpRequest::new(HttpRequestBody::empty())), + } + } + + #[cfg(any(feature = "aws", feature = "azure"))] + pub(crate) fn from_parts(client: HttpClient, request: HttpRequest) -> Self { + Self { + client, + request: Ok(request), + } + } + + pub(crate) fn method(mut self, method: Method) -> Self { + if let Ok(r) = &mut self.request { + *r.method_mut() = method; + } + self + } + + pub(crate) fn uri(mut self, url: U) -> Self + where + U: TryInto, + U::Error: Into, + { + match (url.try_into(), &mut self.request) { + (Ok(uri), Ok(r)) => *r.uri_mut() = uri, + (Err(e), Ok(_)) => self.request = Err(e.into()), + (_, Err(_)) => {} + } + self + } + + pub(crate) fn header(mut self, name: K, value: V) -> Self + where + K: TryInto, + K::Error: Into, + V: TryInto, + V::Error: Into, + { + match (name.try_into(), value.try_into(), &mut self.request) { + (Ok(name), Ok(value), Ok(r)) => { + r.headers_mut().insert(name, value); + } + (Err(e), _, Ok(_)) => self.request = Err(e.into()), + (_, Err(e), Ok(_)) => self.request = Err(e.into()), + (_, _, Err(_)) => {} + } + self + } + + #[cfg(feature = "aws")] + pub(crate) fn headers(mut self, headers: http::HeaderMap) -> Self { + use http::header::{Entry, OccupiedEntry}; + + if let Ok(ref mut req) = self.request { + // IntoIter of HeaderMap yields (Option, HeaderValue). + // The first time a name is yielded, it will be Some(name), and if + // there are more values with the same name, the next yield will be + // None. 
+ + let mut prev_entry: Option> = None; + for (key, value) in headers { + match key { + Some(key) => match req.headers_mut().entry(key) { + Entry::Occupied(mut e) => { + e.insert(value); + prev_entry = Some(e); + } + Entry::Vacant(e) => { + let e = e.insert_entry(value); + prev_entry = Some(e); + } + }, + None => match prev_entry { + Some(ref mut entry) => { + entry.append(value); + } + None => unreachable!("HeaderMap::into_iter yielded None first"), + }, + } + } + } + self + } + + #[cfg(feature = "gcp")] + pub(crate) fn bearer_auth(mut self, token: &str) -> Self { + let value = HeaderValue::try_from(format!("Bearer {}", token)); + match (value, &mut self.request) { + (Ok(mut v), Ok(r)) => { + v.set_sensitive(true); + r.headers_mut().insert(http::header::AUTHORIZATION, v); + } + (Err(e), Ok(_)) => self.request = Err(e.into()), + (_, Err(_)) => {} + } + self + } + + #[cfg(any(feature = "aws", feature = "gcp"))] + pub(crate) fn json(mut self, s: S) -> Self { + match (serde_json::to_vec(&s), &mut self.request) { + (Ok(json), Ok(request)) => { + *request.body_mut() = json.into(); + } + (Err(e), Ok(_)) => self.request = Err(e.into()), + (_, Err(_)) => {} + } + self + } + + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] + pub(crate) fn query(mut self, query: &T) -> Self { + let mut error = None; + if let Ok(ref mut req) = self.request { + let mut out = format!("{}?", req.uri().path()); + let mut encoder = form_urlencoded::Serializer::new(&mut out); + let serializer = serde_urlencoded::Serializer::new(&mut encoder); + + if let Err(err) = query.serialize(serializer) { + error = Some(err.into()); + } + + match http::uri::PathAndQuery::from_maybe_shared(out) { + Ok(p) => { + let mut parts = req.uri().clone().into_parts(); + parts.path_and_query = Some(p); + *req.uri_mut() = Uri::from_parts(parts).unwrap(); + } + Err(err) => error = Some(err.into()), + } + } + if let Some(err) = error { + self.request = Err(err); + } + self + } + + #[cfg(any(feature = "gcp", feature = "azure"))] + pub(crate) fn form(mut self, form: T) -> Self { + let mut error = None; + if let Ok(ref mut req) = self.request { + match serde_urlencoded::to_string(form) { + Ok(body) => { + req.headers_mut().insert( + http::header::CONTENT_TYPE, + HeaderValue::from_static("application/x-www-form-urlencoded"), + ); + *req.body_mut() = body.into(); + } + Err(err) => error = Some(err.into()), + } + } + if let Some(err) = error { + self.request = Err(err); + } + self + } + + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] + pub(crate) fn body(mut self, b: impl Into) -> Self { + if let Ok(r) = &mut self.request { + *r.body_mut() = b.into(); + } + self + } + + pub(crate) fn into_parts(self) -> (HttpClient, Result) { + (self.client, self.request) + } +} + +#[cfg(any(test, feature = "azure"))] +pub(crate) fn add_query_pairs(uri: &mut Uri, query_pairs: I) +where + I: IntoIterator, + I::Item: std::borrow::Borrow<(K, V)>, + K: AsRef, + V: AsRef, +{ + let mut parts = uri.clone().into_parts(); + + let mut out = match parts.path_and_query { + Some(p) => match p.query() { + Some(x) => format!("{}?{}", p.path(), x), + None => format!("{}?", p.path()), + }, + None => "/?".to_string(), + }; + let mut serializer = form_urlencoded::Serializer::new(&mut out); + serializer.extend_pairs(query_pairs); + + parts.path_and_query = Some(out.try_into().unwrap()); + *uri = Uri::from_parts(parts).unwrap(); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_add_query_pairs() { + let mut uri = 
Uri::from_static("https://foo@example.com/bananas?foo=1"); + + add_query_pairs(&mut uri, [("bingo", "foo"), ("auth", "test")]); + assert_eq!( + uri.to_string(), + "https://foo@example.com/bananas?foo=1&bingo=foo&auth=test" + ); + + add_query_pairs(&mut uri, [("t1", "funky shenanigans"), ("a", "😀")]); + assert_eq!( + uri.to_string(), + "https://foo@example.com/bananas?foo=1&bingo=foo&auth=test&t1=funky+shenanigans&a=%F0%9F%98%80" + ); + } +} diff --git a/src/client/connection.rs b/src/client/connection.rs new file mode 100644 index 0000000..8b63169 --- /dev/null +++ b/src/client/connection.rs @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::body::{HttpRequest, HttpResponse}; +use crate::client::builder::{HttpRequestBuilder, RequestBuilderError}; +use crate::client::HttpResponseBody; +use crate::ClientOptions; +use async_trait::async_trait; +use http::{Method, Uri}; +use http_body_util::BodyExt; +use std::error::Error; +use std::sync::Arc; + +/// An HTTP protocol error +/// +/// Clients should return this when an HTTP request fails to be completed, e.g. because +/// of a connection issue. This does **not** include HTTP requests that are return +/// non 2xx Status Codes, as these should instead be returned as an [`HttpResponse`] +/// with the appropriate status code set. 
+#[derive(Debug, thiserror::Error)] +#[error("HTTP error: {source}")] +pub struct HttpError { + kind: HttpErrorKind, + #[source] + source: Box, +} + +/// Identifies the kind of [`HttpError`] +/// +/// This is used, among other things, to determine if a request can be retried +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum HttpErrorKind { + /// An error occurred whilst connecting to the remote + /// + /// Will be automatically retried + Connect, + /// An error occurred whilst making the request + /// + /// Will be automatically retried + Request, + /// Request timed out + /// + /// Will be automatically retried if the request is idempotent + Timeout, + /// The request was aborted + /// + /// Will be automatically retried if the request is idempotent + Interrupted, + /// An error occurred whilst decoding the response + /// + /// Will not be automatically retried + Decode, + /// An unknown error occurred + /// + /// Will not be automatically retried + Unknown, +} + +impl HttpError { + /// Create a new [`HttpError`] with the optional status code + pub fn new(kind: HttpErrorKind, e: E) -> Self + where + E: Error + Send + Sync + 'static, + { + Self { + kind, + source: Box::new(e), + } + } + + pub(crate) fn reqwest(e: reqwest::Error) -> Self { + let mut kind = if e.is_timeout() { + HttpErrorKind::Timeout + } else if e.is_connect() { + HttpErrorKind::Connect + } else if e.is_decode() { + HttpErrorKind::Decode + } else { + HttpErrorKind::Unknown + }; + + // Reqwest error variants aren't great, attempt to refine them + let mut source = e.source(); + while let Some(e) = source { + if let Some(e) = e.downcast_ref::() { + if e.is_closed() || e.is_incomplete_message() || e.is_body_write_aborted() { + kind = HttpErrorKind::Request; + } else if e.is_timeout() { + kind = HttpErrorKind::Timeout; + } + break; + } + if let Some(e) = e.downcast_ref::() { + match e.kind() { + std::io::ErrorKind::TimedOut => kind = HttpErrorKind::Timeout, + std::io::ErrorKind::ConnectionAborted + | std::io::ErrorKind::BrokenPipe + | std::io::ErrorKind::UnexpectedEof => kind = HttpErrorKind::Interrupted, + _ => {} + } + break; + } + source = e.source(); + } + Self { + kind, + // We strip URL as it will be included by RetryError if not sensitive + source: Box::new(e.without_url()), + } + } + + /// Returns the [`HttpErrorKind`] + pub fn kind(&self) -> HttpErrorKind { + self.kind + } +} + +/// An asynchronous function from a [`HttpRequest`] to a [`HttpResponse`]. 
+#[async_trait] +pub trait HttpService: std::fmt::Debug + Send + Sync + 'static { + /// Perform [`HttpRequest`] returning [`HttpResponse`] + async fn call(&self, req: HttpRequest) -> Result; +} + +/// An HTTP client +#[derive(Debug, Clone)] +pub struct HttpClient(Arc); + +impl HttpClient { + /// Create a new [`HttpClient`] from an [`HttpService`] + pub fn new(service: impl HttpService + 'static) -> Self { + Self(Arc::new(service)) + } + + /// Performs [`HttpRequest`] using this client + pub async fn execute(&self, request: HttpRequest) -> Result { + self.0.call(request).await + } + + #[allow(unused)] + pub(crate) fn get(&self, url: U) -> HttpRequestBuilder + where + U: TryInto, + U::Error: Into, + { + self.request(Method::GET, url) + } + + #[allow(unused)] + pub(crate) fn post(&self, url: U) -> HttpRequestBuilder + where + U: TryInto, + U::Error: Into, + { + self.request(Method::POST, url) + } + + #[allow(unused)] + pub(crate) fn put(&self, url: U) -> HttpRequestBuilder + where + U: TryInto, + U::Error: Into, + { + self.request(Method::PUT, url) + } + + #[allow(unused)] + pub(crate) fn delete(&self, url: U) -> HttpRequestBuilder + where + U: TryInto, + U::Error: Into, + { + self.request(Method::DELETE, url) + } + + pub(crate) fn request(&self, method: Method, url: U) -> HttpRequestBuilder + where + U: TryInto, + U::Error: Into, + { + HttpRequestBuilder::new(self.clone()) + .uri(url) + .method(method) + } +} + +#[async_trait] +impl HttpService for reqwest::Client { + async fn call(&self, req: HttpRequest) -> Result { + let (parts, body) = req.into_parts(); + + let url = parts.uri.to_string().parse().unwrap(); + let mut req = reqwest::Request::new(parts.method, url); + *req.headers_mut() = parts.headers; + *req.body_mut() = Some(body.into_reqwest()); + + let r = self.execute(req).await.map_err(HttpError::reqwest)?; + let res: http::Response = r.into(); + let (parts, body) = res.into_parts(); + + let body = HttpResponseBody::new(body.map_err(HttpError::reqwest)); + Ok(HttpResponse::from_parts(parts, body)) + } +} + +/// A factory for [`HttpClient`] +pub trait HttpConnector: std::fmt::Debug + Send + Sync + 'static { + /// Create a new [`HttpClient`] with the provided [`ClientOptions`] + fn connect(&self, options: &ClientOptions) -> crate::Result; +} + +/// [`HttpConnector`] using [`reqwest::Client`] +#[derive(Debug, Default)] +#[allow(missing_copy_implementations)] +pub struct ReqwestConnector {} + +impl HttpConnector for ReqwestConnector { + fn connect(&self, options: &ClientOptions) -> crate::Result { + let client = options.client()?; + Ok(HttpClient::new(client)) + } +} diff --git a/src/client/get.rs b/src/client/get.rs index f252dd9..4c65c6d 100644 --- a/src/client/get.rs +++ b/src/client/get.rs @@ -18,17 +18,17 @@ use std::ops::Range; use crate::client::header::{header_meta, HeaderConfig}; +use crate::client::HttpResponse; use crate::path::Path; use crate::{Attribute, Attributes, GetOptions, GetRange, GetResult, GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; -use hyper::header::{ +use http::header::{ CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_RANGE, CONTENT_TYPE, }; -use hyper::StatusCode; +use http::StatusCode; use reqwest::header::ToStrError; -use reqwest::Response; /// A client that can perform a get request #[async_trait] @@ -38,7 +38,7 @@ pub(crate) trait GetClient: Send + Sync + 'static { /// Configure the [`HeaderConfig`] for this client const HEADER_CONFIG: HeaderConfig; - async fn 
get_request(&self, path: &Path, options: GetOptions) -> Result; + async fn get_request(&self, path: &Path, options: GetOptions) -> Result; } /// Extension trait for [`GetClient`] that adds common retrieval functionality @@ -148,7 +148,7 @@ enum GetResultError { fn get_result( location: &Path, range: Option, - response: Response, + response: HttpResponse, ) -> Result { let mut meta = header_meta(location, response.headers(), T::HEADER_CONFIG)?; @@ -241,6 +241,7 @@ fn get_result( } let stream = response + .into_body() .bytes_stream() .map_err(|source| crate::Error::Generic { store: T::STORE, @@ -259,8 +260,7 @@ fn get_result( #[cfg(test)] mod tests { use super::*; - use hyper::http; - use hyper::http::header::*; + use http::header::*; struct TestClient {} @@ -275,7 +275,7 @@ mod tests { user_defined_metadata_prefix: Some("x-test-meta-"), }; - async fn get_request(&self, _: &Path, _: GetOptions) -> Result { + async fn get_request(&self, _: &Path, _: GetOptions) -> Result { unimplemented!() } } @@ -286,7 +286,7 @@ mod tests { status: StatusCode, content_range: Option<&str>, headers: Option>, - ) -> Response { + ) -> HttpResponse { let mut builder = http::Response::builder(); if let Some(range) = content_range { builder = builder.header(CONTENT_RANGE, range); @@ -306,9 +306,8 @@ mod tests { builder .status(status) .header(CONTENT_LENGTH, object_size) - .body(body) + .body(body.into()) .unwrap() - .into() } #[tokio::test] diff --git a/src/client/header.rs b/src/client/header.rs index db06da6..d7e14b3 100644 --- a/src/client/header.rs +++ b/src/client/header.rs @@ -20,8 +20,8 @@ use crate::path::Path; use crate::ObjectMeta; use chrono::{DateTime, TimeZone, Utc}; -use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; -use hyper::HeaderMap; +use http::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; +use http::HeaderMap; #[derive(Debug, Copy, Clone)] /// Configuration for header extraction diff --git a/src/client/mod.rs b/src/client/mod.rs index 11f8b3e..4fe3cff 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -42,19 +42,28 @@ pub(crate) mod header; #[cfg(any(feature = "aws", feature = "gcp"))] pub(crate) mod s3; +mod body; +pub use body::{HttpRequest, HttpRequestBody, HttpResponse, HttpResponseBody}; + +pub(crate) mod builder; + +mod connection; +pub use connection::{ + HttpClient, HttpConnector, HttpError, HttpErrorKind, HttpService, ReqwestConnector, +}; + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub(crate) mod parts; use async_trait::async_trait; +use reqwest::header::{HeaderMap, HeaderValue}; +use reqwest::{Client, ClientBuilder, NoProxy, Proxy}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, ClientBuilder, NoProxy, Proxy, RequestBuilder}; -use serde::{Deserialize, Serialize}; - use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; use crate::{GetOptions, Result}; @@ -593,17 +602,16 @@ impl ClientOptions { } } - /// Create a [`Client`] with overrides optimised for metadata endpoint access + /// Returns a copy of this [`ClientOptions`] with overrides necessary for metadata endpoint access /// /// In particular: /// * Allows HTTP as metadata endpoints do not use TLS /// * Configures a low connection timeout to provide quick feedback if not present #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] - pub(crate) fn metadata_client(&self) -> Result { + pub(crate) fn 
metadata_options(&self) -> Self { self.clone() .with_allow_http(true) .with_connect_timeout(Duration::from_secs(1)) - .client() } pub(crate) fn client(&self) -> Result { @@ -706,7 +714,7 @@ pub(crate) trait GetOptionsExt { fn with_get_options(self, options: GetOptions) -> Self; } -impl GetOptionsExt for RequestBuilder { +impl GetOptionsExt for HttpRequestBuilder { fn with_get_options(mut self, options: GetOptions) -> Self { use hyper::header::*; @@ -782,13 +790,13 @@ mod cloud { #[derive(Debug)] pub(crate) struct TokenCredentialProvider { inner: T, - client: Client, + client: HttpClient, retry: RetryConfig, cache: TokenCache>, } impl TokenCredentialProvider { - pub(crate) fn new(inner: T, client: Client, retry: RetryConfig) -> Self { + pub(crate) fn new(inner: T, client: HttpClient, retry: RetryConfig) -> Self { Self { inner, client, @@ -822,12 +830,13 @@ mod cloud { async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> Result>>; } } +use crate::client::builder::HttpRequestBuilder; #[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] pub(crate) use cloud::*; diff --git a/src/client/retry.rs b/src/client/retry.rs index a3f8fcb..96244aa 100644 --- a/src/client/retry.rs +++ b/src/client/retry.rs @@ -18,60 +18,118 @@ //! A shared HTTP client implementation incorporating retries use crate::client::backoff::{Backoff, BackoffConfig}; +use crate::client::builder::HttpRequestBuilder; +use crate::client::connection::HttpErrorKind; +use crate::client::{HttpClient, HttpError, HttpRequest, HttpResponse}; use crate::PutPayload; use futures::future::BoxFuture; +use http::{Method, Uri}; use reqwest::header::LOCATION; -use reqwest::{Client, Request, Response, StatusCode}; -use std::error::Error as StdError; +use reqwest::StatusCode; use std::time::{Duration, Instant}; -use tracing::{debug, info}; +use tracing::info; /// Retry request error #[derive(Debug, thiserror::Error)] -pub enum Error { - #[error("Received redirect without LOCATION, this normally indicates an incorrectly configured region")] +pub struct RetryError { + method: Method, + uri: Option, + retries: usize, + max_retries: usize, + elapsed: Duration, + retry_timeout: Duration, + inner: RequestError, +} + +impl std::fmt::Display for RetryError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Error performing {} ", self.method)?; + match &self.uri { + Some(uri) => write!(f, "{uri} ")?, + None => write!(f, "REDACTED ")?, + } + write!(f, "in {:?}", self.elapsed)?; + if self.retries != 0 { + write!( + f, + ", after {} retries, max_retries: {}, retry_timeout: {:?} ", + self.retries, self.max_retries, self.retry_timeout + )?; + } + write!(f, " - {}", self.inner) + } +} + +/// Context of the retry loop +struct RetryContext { + method: Method, + uri: Option, + retries: usize, + max_retries: usize, + start: Instant, + retry_timeout: Duration, +} + +impl RetryContext { + fn err(self, error: RequestError) -> RetryError { + RetryError { + uri: self.uri, + method: self.method, + retries: self.retries, + max_retries: self.max_retries, + elapsed: self.start.elapsed(), + retry_timeout: self.retry_timeout, + inner: error, + } + } + + fn exhausted(&self) -> bool { + self.retries == self.max_retries || self.start.elapsed() > self.retry_timeout + } +} + +/// The reason a request failed +#[derive(Debug, thiserror::Error)] +pub enum RequestError { + #[error("Received redirect without LOCATION, this normally indicates an incorrectly configured region" + )] BareRedirect, - 
#[error("Server error, body contains Error, with status {status}: {}", body.as_deref().unwrap_or("No Body"))] - Server { + #[error("Server returned non-2xx status code: {status}: {}", body.as_deref().unwrap_or(""))] + Status { status: StatusCode, body: Option, }, - #[error("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body"))] - Client { - status: StatusCode, - body: Option, - }, + #[error("Server returned error response: {body}")] + Response { status: StatusCode, body: String }, - #[error("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}")] - Reqwest { - retries: usize, - max_retries: usize, - elapsed: Duration, - retry_timeout: Duration, - source: reqwest::Error, - }, + #[error(transparent)] + Http(#[from] HttpError), } -impl Error { +impl RetryError { + /// Returns the underlying [`RequestError`] + pub fn inner(&self) -> &RequestError { + &self.inner + } + /// Returns the status code associated with this error if any pub fn status(&self) -> Option { - match self { - Self::BareRedirect => None, - Self::Server { status, .. } => Some(*status), - Self::Client { status, .. } => Some(*status), - Self::Reqwest { source, .. } => source.status(), + match &self.inner { + RequestError::Status { status, .. } | RequestError::Response { status, .. } => { + Some(*status) + } + RequestError::BareRedirect | RequestError::Http(_) => None, } } /// Returns the error body if any pub fn body(&self) -> Option<&str> { - match self { - Self::Client { body, .. } => body.as_deref(), - Self::Server { body, .. } => body.as_deref(), - Self::BareRedirect => None, - Self::Reqwest { .. } => None, + match &self.inner { + RequestError::Status { body, .. } => body.as_deref(), + RequestError::Response { body, .. } => Some(body), + RequestError::BareRedirect | RequestError::Http(_) => None, } } @@ -109,34 +167,29 @@ impl Error { } } -impl From for std::io::Error { - fn from(err: Error) -> Self { +impl From for std::io::Error { + fn from(err: RetryError) -> Self { use std::io::ErrorKind; - match &err { - Error::Client { - status: StatusCode::NOT_FOUND, - .. - } => Self::new(ErrorKind::NotFound, err), - Error::Client { - status: StatusCode::BAD_REQUEST, - .. - } => Self::new(ErrorKind::InvalidInput, err), - Error::Client { - status: StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN, - .. - } => Self::new(ErrorKind::PermissionDenied, err), - Error::Reqwest { source, .. } if source.is_timeout() => { - Self::new(ErrorKind::TimedOut, err) + let kind = match err.status() { + Some(StatusCode::NOT_FOUND) => ErrorKind::NotFound, + Some(StatusCode::BAD_REQUEST) => ErrorKind::InvalidInput, + Some(StatusCode::UNAUTHORIZED) | Some(StatusCode::FORBIDDEN) => { + ErrorKind::PermissionDenied } - Error::Reqwest { source, .. 
} if source.is_connect() => { - Self::new(ErrorKind::NotConnected, err) - } - _ => Self::new(ErrorKind::Other, err), - } + _ => match &err.inner { + RequestError::Http(h) => match h.kind() { + HttpErrorKind::Timeout => ErrorKind::TimedOut, + HttpErrorKind::Connect => ErrorKind::NotConnected, + _ => ErrorKind::Other, + }, + _ => ErrorKind::Other, + }, + }; + Self::new(kind, err) } } -pub(crate) type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// The configuration for how to respond to request errors /// @@ -190,8 +243,8 @@ fn body_contains_error(response_body: &str) -> bool { } pub(crate) struct RetryableRequest { - client: Client, - request: Request, + client: HttpClient, + request: HttpRequest, max_retries: usize, retry_timeout: Duration, @@ -247,35 +300,32 @@ impl RetryableRequest { } } - pub(crate) async fn send(self) -> Result { - let max_retries = self.max_retries; - let retry_timeout = self.retry_timeout; - let mut retries = 0; - let now = Instant::now(); + pub(crate) async fn send(self) -> Result { + let mut ctx = RetryContext { + retries: 0, + uri: (!self.sensitive).then(|| self.request.uri().clone()), + method: self.request.method().clone(), + max_retries: self.max_retries, + start: Instant::now(), + retry_timeout: self.retry_timeout, + }; let mut backoff = self.backoff; let is_idempotent = self .idempotent .unwrap_or_else(|| self.request.method().is_safe()); - let sanitize_err = move |e: reqwest::Error| match self.sensitive { - true => e.without_url(), - false => e, - }; - loop { - let mut request = self - .request - .try_clone() - .expect("request body must be cloneable"); + let mut request = self.request.clone(); if let Some(payload) = &self.payload { - *request.body_mut() = Some(payload.body()); + *request.body_mut() = payload.clone().into(); } match self.client.execute(request).await { - Ok(r) => match r.error_for_status_ref() { - Ok(_) if r.status().is_success() => { + Ok(r) => { + let status = r.status(); + if status.is_success() { // For certain S3 requests, 200 response may contain `InternalError` or // `SlowDown` in the message. These responses should be handled similarly // to r5xx errors. 
@@ -284,164 +334,95 @@ impl RetryableRequest { return Ok(r); } - let status = r.status(); - let headers = r.headers().clone(); - - let bytes = r.bytes().await.map_err(|e| Error::Reqwest { - retries, - max_retries, - elapsed: now.elapsed(), - retry_timeout, - source: e, - })?; - - let response_body = String::from_utf8_lossy(&bytes); - debug!("Checking for error in response_body: {}", response_body); + let (parts, body) = r.into_parts(); + let body = match body.text().await { + Ok(body) => body, + Err(e) => return Err(ctx.err(RequestError::Http(e))), + }; - if !body_contains_error(&response_body) { + if !body_contains_error(&body) { // Success response and no error, clone and return response - let mut success_response = hyper::Response::new(bytes); - *success_response.status_mut() = status; - *success_response.headers_mut() = headers; - - return Ok(reqwest::Response::from(success_response)); + return Ok(HttpResponse::from_parts(parts, body.into())); } else { // Retry as if this was a 5xx response - if retries == max_retries || now.elapsed() > retry_timeout { - return Err(Error::Server { - body: Some(response_body.into_owned()), - status, - }); + if ctx.exhausted() { + return Err(ctx.err(RequestError::Response { body, status })); } let sleep = backoff.next(); - retries += 1; + ctx.retries += 1; info!( "Encountered a response status of {} but body contains Error, backing off for {} seconds, retry {} of {}", status, sleep.as_secs_f32(), - retries, - max_retries, + ctx.retries, + ctx.max_retries, ); tokio::time::sleep(sleep).await; } - } - Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { - return Err(Error::Client { - body: None, - status: StatusCode::NOT_MODIFIED, - }) - } - Ok(r) => { - let is_bare_redirect = - r.status().is_redirection() && !r.headers().contains_key(LOCATION); + } else if status == StatusCode::NOT_MODIFIED { + return Err(ctx.err(RequestError::Status { status, body: None })); + } else if status.is_redirection() { + let is_bare_redirect = !r.headers().contains_key(LOCATION); return match is_bare_redirect { - true => Err(Error::BareRedirect), - // Not actually sure if this is reachable, but here for completeness - false => Err(Error::Client { + true => Err(ctx.err(RequestError::BareRedirect)), + false => Err(ctx.err(RequestError::Status { body: None, status: r.status(), - }), + })), }; - } - Err(e) => { - let e = sanitize_err(e); + } else { let status = r.status(); - if retries == max_retries - || now.elapsed() > retry_timeout + if ctx.exhausted() || !(status.is_server_error() || (self.retry_on_conflict && status == StatusCode::CONFLICT)) { - return Err(match status.is_client_error() { - true => match r.text().await { - Ok(body) => Error::Client { - body: Some(body).filter(|b| !b.is_empty()), + let source = match status.is_client_error() { + true => match r.into_body().text().await { + Ok(body) => RequestError::Status { status, + body: Some(body), }, - Err(e) => Error::Reqwest { - retries, - max_retries, - elapsed: now.elapsed(), - retry_timeout, - source: e, - }, + Err(e) => RequestError::Http(e), }, - false => Error::Reqwest { - retries, - max_retries, - elapsed: now.elapsed(), - retry_timeout, - source: e, - }, - }); - } + false => RequestError::Status { status, body: None }, + }; + return Err(ctx.err(source)); + }; let sleep = backoff.next(); - retries += 1; + ctx.retries += 1; info!( - "Encountered server error, backing off for {} seconds, retry {} of {}: {}", + "Encountered server error, backing off for {} seconds, retry {} of {}", sleep.as_secs_f32(), - retries, - 
max_retries, - e, + ctx.retries, + ctx.max_retries, ); tokio::time::sleep(sleep).await; } - }, + } Err(e) => { - let e = sanitize_err(e); + // let e = sanitize_err(e); - let mut do_retry = false; - if e.is_connect() - || e.is_body() - || (e.is_request() && !e.is_timeout()) - || (is_idempotent && e.is_timeout()) - { - do_retry = true - } else { - let mut source = e.source(); - while let Some(e) = source { - if let Some(e) = e.downcast_ref::() { - do_retry = e.is_closed() - || e.is_incomplete_message() - || e.is_body_write_aborted() - || (is_idempotent && e.is_timeout()); - break; - } - if let Some(e) = e.downcast_ref::() { - if e.kind() == std::io::ErrorKind::TimedOut { - do_retry = is_idempotent; - } else { - do_retry = matches!( - e.kind(), - std::io::ErrorKind::ConnectionReset - | std::io::ErrorKind::ConnectionAborted - | std::io::ErrorKind::BrokenPipe - | std::io::ErrorKind::UnexpectedEof - ); - } - break; - } - source = e.source(); - } - } + let do_retry = match e.kind() { + HttpErrorKind::Connect | HttpErrorKind::Request => true, // Request not sent, can retry + HttpErrorKind::Timeout | HttpErrorKind::Interrupted => is_idempotent, + HttpErrorKind::Unknown | HttpErrorKind::Decode => false, + }; - if retries == max_retries || now.elapsed() > retry_timeout || !do_retry { - return Err(Error::Reqwest { - retries, - max_retries, - elapsed: now.elapsed(), - retry_timeout, - source: e, - }); + if ctx.retries == ctx.max_retries + || ctx.start.elapsed() > ctx.retry_timeout + || !do_retry + { + return Err(ctx.err(RequestError::Http(e))); } let sleep = backoff.next(); - retries += 1; + ctx.retries += 1; info!( "Encountered transport error backing off for {} seconds, retry {} of {}: {}", sleep.as_secs_f32(), - retries, - max_retries, + ctx.retries, + ctx.max_retries, e, ); tokio::time::sleep(sleep).await; @@ -460,12 +441,12 @@ pub(crate) trait RetryExt { /// # Panic /// /// This will panic if the request body is a stream - fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result>; + fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result>; } -impl RetryExt for reqwest::RequestBuilder { +impl RetryExt for HttpRequestBuilder { fn retryable(self, config: &RetryConfig) -> RetryableRequest { - let (client, request) = self.build_split(); + let (client, request) = self.into_parts(); let request = request.expect("request must be valid"); RetryableRequest { @@ -482,7 +463,7 @@ impl RetryExt for reqwest::RequestBuilder { } } - fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result> { + fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result> { let request = self.retryable(config); Box::pin(async move { request.send().await }) } @@ -491,7 +472,8 @@ impl RetryExt for reqwest::RequestBuilder { #[cfg(test)] mod tests { use crate::client::mock_server::MockServer; - use crate::client::retry::{body_contains_error, Error, RetryExt}; + use crate::client::retry::{body_contains_error, RequestError, RetryExt}; + use crate::client::HttpClient; use crate::RetryConfig; use hyper::header::LOCATION; use hyper::Response; @@ -522,10 +504,12 @@ mod tests { retry_timeout: Duration::from_secs(1000), }; - let client = Client::builder() - .timeout(Duration::from_millis(100)) - .build() - .unwrap(); + let client = HttpClient::new( + Client::builder() + .timeout(Duration::from_millis(100)) + .build() + .unwrap(), + ); let do_request = || client.request(Method::GET, mock.url()).send_retry(&retry); @@ -545,24 +529,24 @@ mod tests { assert_eq!(e.status().unwrap(), 
StatusCode::BAD_REQUEST); assert_eq!(e.body(), Some("cupcakes")); assert_eq!( - e.to_string(), - "Client error with status 400 Bad Request: cupcakes" + e.inner().to_string(), + "Server returned non-2xx status code: 400 Bad Request: cupcakes" ); // Handles client errors with no payload mock.push( Response::builder() .status(StatusCode::BAD_REQUEST) - .body(String::new()) + .body("NAUGHTY NAUGHTY".to_string()) .unwrap(), ); let e = do_request().await.unwrap_err(); assert_eq!(e.status().unwrap(), StatusCode::BAD_REQUEST); - assert_eq!(e.body(), None); + assert_eq!(e.body(), Some("NAUGHTY NAUGHTY")); assert_eq!( - e.to_string(), - "Client error with status 400 Bad Request: No Body" + e.inner().to_string(), + "Server returned non-2xx status code: 400 Bad Request: NAUGHTY NAUGHTY" ); // Should retry server error request @@ -598,7 +582,6 @@ mod tests { let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::OK); - assert_eq!(r.url().path(), "/foo"); // Follows 401 redirects mock.push( @@ -611,7 +594,6 @@ mod tests { let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::OK); - assert_eq!(r.url().path(), "/bar"); // Handles redirect loop for _ in 0..10 { @@ -625,7 +607,7 @@ mod tests { } let e = do_request().await.unwrap_err().to_string(); - assert!(e.contains("error following redirect for url"), "{}", e); + assert!(e.contains("error following redirect"), "{}", e); // Handles redirect missing location mock.push( @@ -636,8 +618,8 @@ mod tests { ); let e = do_request().await.unwrap_err(); - assert!(matches!(e, Error::BareRedirect)); - assert_eq!(e.to_string(), "Received redirect without LOCATION, this normally indicates an incorrectly configured region"); + assert!(matches!(e.inner, RequestError::BareRedirect)); + assert_eq!(e.inner().to_string(), "Received redirect without LOCATION, this normally indicates an incorrectly configured region"); // Gives up after the retrying the specified number of times for _ in 0..=retry.max_retries { @@ -651,8 +633,7 @@ mod tests { let e = do_request().await.unwrap_err().to_string(); assert!( - e.contains("Error after 2 retries in") && - e.contains("max_retries:2, retry_timeout:1000s, source:HTTP status server error (502 Bad Gateway) for url"), + e.contains(" after 2 retries, max_retries: 2, retry_timeout: 1000s - Server returned non-2xx status code: 502 Bad Gateway"), "{e}" ); @@ -667,10 +648,7 @@ mod tests { } let e = do_request().await.unwrap_err().to_string(); assert!( - e.contains("Error after 2 retries in") - && e.contains( - "max_retries:2, retry_timeout:1000s, source:error sending request for url" - ), + e.contains("after 2 retries, max_retries: 2, retry_timeout: 1000s - HTTP error: error sending request"), "{e}" ); @@ -689,7 +667,7 @@ mod tests { let res = client.request(Method::PUT, mock.url()).send_retry(&retry); let e = res.await.unwrap_err().to_string(); assert!( - e.contains("Error after 0 retries in") && e.contains("error sending request for url"), + !e.contains("retries") && e.contains("error sending request"), "{e}" ); @@ -750,7 +728,8 @@ mod tests { let r = req.send().await.unwrap(); assert_eq!(r.status(), StatusCode::OK); // Response with InternalError should have been retried - assert!(!r.text().await.unwrap().contains("InternalError")); + let b = r.into_body().text().await.unwrap(); + assert!(!b.contains("InternalError")); // Should not retry success response with no error in body mock.push( @@ -766,7 +745,8 @@ mod tests { .retry_error_body(true); let r = req.send().await.unwrap(); assert_eq!(r.status(), 
StatusCode::OK); - assert!(r.text().await.unwrap().contains("success")); + let b = r.into_body().text().await.unwrap(); + assert!(b.contains("success")); // Shutdown mock.shutdown().await diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index cc5c1e1..7939783 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::client::TokenCredentialProvider; +use crate::client::{HttpConnector, ReqwestConnector, TokenCredentialProvider}; use crate::gcp::client::{GoogleCloudStorageClient, GoogleCloudStorageConfig}; use crate::gcp::credential::{ ApplicationDefaultCredentials, InstanceCredentialProvider, ServiceAccountCredentials, @@ -111,6 +111,8 @@ pub struct GoogleCloudStorageBuilder { credentials: Option, /// Credentials for sign url signing_credentials: Option, + /// The [`HttpConnector`] to use + http_connector: Option>, } /// Configuration keys for [`GoogleCloudStorageBuilder`] @@ -207,6 +209,7 @@ impl Default for GoogleCloudStorageBuilder { url: None, credentials: None, signing_credentials: None, + http_connector: None, } } } @@ -424,6 +427,12 @@ impl GoogleCloudStorageBuilder { self } + /// Overrides the [`HttpConnector`], by default uses [`ReqwestConnector`] + pub fn with_http_connector(mut self, connector: C) -> Self { + self.http_connector = Some(Arc::new(connector)); + self + } + /// Configure a connection to Google Cloud Storage, returning a /// new [`GoogleCloudStorage`] and consuming `self` pub fn build(mut self) -> Result { @@ -433,6 +442,10 @@ impl GoogleCloudStorageBuilder { let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; + let http = self + .http_connector + .unwrap_or_else(|| Arc::new(ReqwestConnector::default())); + // First try to initialize from the service account information. 
let service_account_credentials = match (self.service_account_path, self.service_account_key) { @@ -471,7 +484,7 @@ impl GoogleCloudStorageBuilder { } else if let Some(credentials) = service_account_credentials.clone() { Arc::new(TokenCredentialProvider::new( credentials.token_provider()?, - self.client_options.client()?, + http.connect(&self.client_options)?, self.retry_config.clone(), )) as _ } else if let Some(credentials) = application_default_credentials.clone() { @@ -479,7 +492,7 @@ impl GoogleCloudStorageBuilder { ApplicationDefaultCredentials::AuthorizedUser(token) => Arc::new( TokenCredentialProvider::new( token, - self.client_options.client()?, + http.connect(&self.client_options)?, self.retry_config.clone(), ) .with_min_ttl(TOKEN_MIN_TTL), @@ -487,7 +500,7 @@ impl GoogleCloudStorageBuilder { ApplicationDefaultCredentials::ServiceAccount(token) => { Arc::new(TokenCredentialProvider::new( token.token_provider()?, - self.client_options.client()?, + http.connect(&self.client_options)?, self.retry_config.clone(), )) as _ } @@ -496,7 +509,7 @@ impl GoogleCloudStorageBuilder { Arc::new( TokenCredentialProvider::new( InstanceCredentialProvider::default(), - self.client_options.metadata_client()?, + http.connect(&self.client_options.metadata_options())?, self.retry_config.clone(), ) .with_min_ttl(TOKEN_MIN_TTL), @@ -517,7 +530,7 @@ impl GoogleCloudStorageBuilder { ApplicationDefaultCredentials::AuthorizedUser(token) => { Arc::new(TokenCredentialProvider::new( AuthorizedUserSigningCredentials::from(token)?, - self.client_options.client()?, + http.connect(&self.client_options)?, self.retry_config.clone(), )) as _ } @@ -528,7 +541,7 @@ impl GoogleCloudStorageBuilder { } else { Arc::new(TokenCredentialProvider::new( InstanceSigningCredentialProvider::default(), - self.client_options.metadata_client()?, + http.connect(&self.client_options.metadata_options())?, self.retry_config.clone(), )) as _ }; @@ -542,8 +555,9 @@ impl GoogleCloudStorageBuilder { self.client_options, ); + let http_client = http.connect(&config.client_options)?; Ok(GoogleCloudStorage { - client: Arc::new(GoogleCloudStorageClient::new(config)?), + client: Arc::new(GoogleCloudStorageClient::new(config, http_client)?), }) } } diff --git a/src/gcp/client.rs b/src/gcp/client.rs index 8dd1c69..a52ad36 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::client::builder::HttpRequestBuilder; use crate::client::get::GetClient; use crate::client::header::{get_put_result, get_version, HeaderConfig}; use crate::client::list::ListClient; @@ -23,7 +24,7 @@ use crate::client::s3::{ CompleteMultipartUpload, CompleteMultipartUploadResult, InitiateMultipartUploadResult, ListResponse, }; -use crate::client::GetOptionsExt; +use crate::client::{GetOptionsExt, HttpClient, HttpError, HttpResponse}; use crate::gcp::{GcpCredential, GcpCredentialProvider, GcpSigningCredentialProvider, STORE}; use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; @@ -36,13 +37,12 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Buf; -use hyper::header::{ +use http::header::{ CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_TYPE, }; +use http::{HeaderName, Method, StatusCode}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; -use reqwest::header::HeaderName; -use reqwest::{Client, Method, RequestBuilder, Response, StatusCode}; use serde::{Deserialize, Serialize}; use std::sync::Arc; @@ -55,28 +55,30 @@ static VERSION_MATCH: HeaderName = HeaderName::from_static("x-goog-if-generation #[derive(Debug, thiserror::Error)] enum Error { #[error("Error performing list request: {}", source)] - ListRequest { source: crate::client::retry::Error }, + ListRequest { + source: crate::client::retry::RetryError, + }, #[error("Error getting list response body: {}", source)] - ListResponseBody { source: reqwest::Error }, + ListResponseBody { source: HttpError }, #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, #[error("Error performing get request {}: {}", path, source)] GetRequest { - source: crate::client::retry::Error, + source: crate::client::retry::RetryError, path: String, }, #[error("Error performing request {}: {}", path, source)] Request { - source: crate::client::retry::Error, + source: crate::client::retry::RetryError, path: String, }, #[error("Error getting put response body: {}", source)] - PutResponseBody { source: reqwest::Error }, + PutResponseBody { source: HttpError }, #[error("Got invalid put request: {}", source)] InvalidPutRequest { source: quick_xml::se::SeError }, @@ -93,19 +95,23 @@ enum Error { MissingVersion, #[error("Error performing complete multipart request: {}", source)] - CompleteMultipartRequest { source: crate::client::retry::Error }, + CompleteMultipartRequest { + source: crate::client::retry::RetryError, + }, #[error("Error getting complete multipart response body: {}", source)] - CompleteMultipartResponseBody { source: reqwest::Error }, + CompleteMultipartResponseBody { source: HttpError }, #[error("Got invalid multipart response: {}", source)] InvalidMultipartResponse { source: quick_xml::de::DeError }, #[error("Error signing blob: {}", source)] - SignBlobRequest { source: crate::client::retry::Error }, + SignBlobRequest { + source: crate::client::retry::RetryError, + }, #[error("Got invalid signing blob response: {}", source)] - InvalidSignBlobResponse { source: reqwest::Error }, + InvalidSignBlobResponse { source: HttpError }, #[error("Got invalid signing blob signature: {}", source)] InvalidSignBlobSignature { source: base64::DecodeError }, @@ -169,7 +175,7 @@ pub(crate) struct Request<'a> { path: &'a Path, config: &'a GoogleCloudStorageConfig, payload: Option, - builder: RequestBuilder, + builder: HttpRequestBuilder, idempotent: bool, } @@ -225,7 
+231,7 @@ impl Request<'_> { } } - async fn send(self) -> Result { + async fn send(self) -> Result { let credential = self.config.credentials.get_credential().await?; let resp = self .builder @@ -268,7 +274,7 @@ struct SignBlobResponse { pub(crate) struct GoogleCloudStorageClient { config: GoogleCloudStorageConfig, - client: Client, + client: HttpClient, bucket_name_encoded: String, @@ -277,8 +283,7 @@ pub(crate) struct GoogleCloudStorageClient { } impl GoogleCloudStorageClient { - pub(crate) fn new(config: GoogleCloudStorageConfig) -> Result { - let client = config.client_options.client()?; + pub(crate) fn new(config: GoogleCloudStorageConfig, client: HttpClient) -> Result { let bucket_name_encoded = percent_encode(config.bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); @@ -337,10 +342,8 @@ impl GoogleCloudStorageClient { .idempotent(true) .send() .await - .map_err(|source| Error::SignBlobRequest { source })?; - - //If successful, the signature is returned in the signedBlob field in the response. - let response = response + .map_err(|source| Error::SignBlobRequest { source })? + .into_body() .json::() .await .map_err(|source| Error::InvalidSignBlobResponse { source })?; @@ -445,6 +448,7 @@ impl GoogleCloudStorageClient { .await?; let data = response + .into_body() .bytes() .await .map_err(|source| Error::PutResponseBody { source })?; @@ -527,6 +531,7 @@ impl GoogleCloudStorageClient { .map_err(|source| Error::Metadata { source })?; let data = response + .into_body() .bytes() .await .map_err(|source| Error::CompleteMultipartResponseBody { source })?; @@ -600,7 +605,7 @@ impl GetClient for GoogleCloudStorageClient { }; /// Perform a get request - async fn get_request(&self, path: &Path, options: GetOptions) -> Result { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let credential = self.get_credential().await?; let url = self.object_url(path); @@ -675,6 +680,7 @@ impl ListClient for Arc { .send_retry(&self.config.retry_config) .await .map_err(|source| Error::ListRequest { source })? 
+ .into_body() .bytes() .await .map_err(|source| Error::ListResponseBody { source })?; diff --git a/src/gcp/credential.rs b/src/gcp/credential.rs index 4b21ad1..373c2c2 100644 --- a/src/gcp/credential.rs +++ b/src/gcp/credential.rs @@ -18,7 +18,7 @@ use super::client::GoogleCloudStorageClient; use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; -use crate::client::TokenProvider; +use crate::client::{HttpClient, HttpError, TokenProvider}; use crate::gcp::{GcpSigningCredentialProvider, STORE}; use crate::util::{hex_digest, hex_encode, STRICT_ENCODE_SET}; use crate::{RetryConfig, StaticCredentialProvider}; @@ -27,10 +27,9 @@ use base64::prelude::BASE64_URL_SAFE_NO_PAD; use base64::Engine; use chrono::{DateTime, Utc}; use futures::TryFutureExt; -use hyper::HeaderMap; +use http::{HeaderMap, Method}; use itertools::Itertools; use percent_encoding::utf8_percent_encode; -use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use serde::Deserialize; use std::collections::BTreeMap; @@ -83,10 +82,12 @@ pub enum Error { UnsupportedKey { encoding: String }, #[error("Error performing token request: {}", source)] - TokenRequest { source: crate::client::retry::Error }, + TokenRequest { + source: crate::client::retry::RetryError, + }, #[error("Error getting token response body: {}", source)] - TokenResponseBody { source: reqwest::Error }, + TokenResponseBody { source: HttpError }, } impl From for crate::Error { @@ -259,7 +260,7 @@ impl TokenProvider for SelfSignedJwt { /// Fetch a fresh token async fn fetch_token( &self, - _client: &Client, + _client: &HttpClient, _retry: &RetryConfig, ) -> crate::Result>> { let now = seconds_since_epoch(); @@ -395,19 +396,20 @@ pub(crate) struct InstanceCredentialProvider {} /// Make a request to the metadata server to fetch a token, using a a given hostname. async fn make_metadata_request( - client: &Client, + client: &HttpClient, hostname: &str, retry: &RetryConfig, ) -> crate::Result { let url = format!("http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token"); let response: TokenResponse = client - .request(Method::GET, url) + .get(url) .header("Metadata-Flavor", "Google") .query(&[("audience", "https://www.googleapis.com/oauth2/v4/token")]) .send_retry(retry) .await .map_err(|source| Error::TokenRequest { source })? + .into_body() .json() .await .map_err(|source| Error::TokenResponseBody { source })?; @@ -426,7 +428,7 @@ impl TokenProvider for InstanceCredentialProvider { /// References: async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> crate::Result>> { let metadata_host = if let Ok(host) = env::var("GCE_METADATA_HOST") { @@ -459,18 +461,19 @@ impl TokenProvider for InstanceCredentialProvider { /// Make a request to the metadata server to fetch the client email, using a given hostname. async fn make_metadata_request_for_email( - client: &Client, + client: &HttpClient, hostname: &str, retry: &RetryConfig, ) -> crate::Result { let url = format!("http://{hostname}/computeMetadata/v1/instance/service-accounts/default/email",); let response = client - .request(Method::GET, url) + .get(url) .header("Metadata-Flavor", "Google") .send_retry(retry) .await .map_err(|source| Error::TokenRequest { source })? 
+ .into_body() .text() .await .map_err(|source| Error::TokenResponseBody { source })?; @@ -495,7 +498,7 @@ impl TokenProvider for InstanceSigningCredentialProvider { /// References: async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> crate::Result>> { let metadata_host = if let Ok(host) = env::var("GCE_METADATA_HOST") { @@ -605,13 +608,18 @@ impl AuthorizedUserSigningCredentials { Ok(Self { credential }) } - async fn client_email(&self, client: &Client, retry: &RetryConfig) -> crate::Result { + async fn client_email( + &self, + client: &HttpClient, + retry: &RetryConfig, + ) -> crate::Result { let response = client - .request(Method::GET, "https://oauth2.googleapis.com/tokeninfo") + .get("https://oauth2.googleapis.com/tokeninfo") .query(&[("access_token", &self.credential.refresh_token)]) .send_retry(retry) .await .map_err(|source| Error::TokenRequest { source })? + .into_body() .json::() .await .map_err(|source| Error::TokenResponseBody { source })?; @@ -626,7 +634,7 @@ impl TokenProvider for AuthorizedUserSigningCredentials { async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> crate::Result>> { let email = self.client_email(client, retry).await?; @@ -647,12 +655,12 @@ impl TokenProvider for AuthorizedUserCredentials { async fn fetch_token( &self, - client: &Client, + client: &HttpClient, retry: &RetryConfig, ) -> crate::Result>> { let response = client - .request(Method::POST, DEFAULT_TOKEN_GCP_URI) - .form(&[ + .post(DEFAULT_TOKEN_GCP_URI) + .form([ ("grant_type", "refresh_token"), ("client_id", &self.client_id), ("client_secret", &self.client_secret), @@ -663,6 +671,7 @@ impl TokenProvider for AuthorizedUserCredentials { .send() .await .map_err(|source| Error::TokenRequest { source })? 
+ .into_body() .json::() .await .map_err(|source| Error::TokenResponseBody { source })?; diff --git a/src/gcp/mod.rs b/src/gcp/mod.rs index 2f6630d..5f8c67d 100644 --- a/src/gcp/mod.rs +++ b/src/gcp/mod.rs @@ -48,7 +48,7 @@ use crate::{ use async_trait::async_trait; use client::GoogleCloudStorageClient; use futures::stream::BoxStream; -use hyper::Method; +use http::Method; use url::Url; use crate::client::get::GetClientExt; @@ -414,7 +414,7 @@ mod test { .unwrap_err() .to_string(); assert!( - err.contains("Client error with status 404 Not Found"), + err.contains("Server returned non-2xx status code: 404 Not Found"), "{}", err ) diff --git a/src/http/client.rs b/src/http/client.rs index 9983fdf..652d326 100644 --- a/src/http/client.rs +++ b/src/http/client.rs @@ -18,29 +18,29 @@ use crate::client::get::GetClient; use crate::client::header::HeaderConfig; use crate::client::retry::{self, RetryConfig, RetryExt}; -use crate::client::GetOptionsExt; +use crate::client::{GetOptionsExt, HttpClient, HttpError, HttpResponse}; use crate::path::{Path, DELIMITER}; use crate::util::deserialize_rfc1123; use crate::{Attribute, Attributes, ClientOptions, GetOptions, ObjectMeta, PutPayload, Result}; use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; -use hyper::header::{ +use http::header::{ CACHE_CONTROL, CONTENT_DISPOSITION, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_TYPE, }; use percent_encoding::percent_decode_str; -use reqwest::{Method, Response, StatusCode}; +use reqwest::{Method, StatusCode}; use serde::Deserialize; use url::Url; #[derive(Debug, thiserror::Error)] enum Error { #[error("Request error: {}", source)] - Request { source: retry::Error }, + Request { source: retry::RetryError }, #[error("Request error: {}", source)] - Reqwest { source: reqwest::Error }, + Reqwest { source: HttpError }, #[error("Range request not supported by {}", href)] RangeNotSupported { href: String }, @@ -86,7 +86,7 @@ impl From for crate::Error { #[derive(Debug)] pub(crate) struct Client { url: Url, - client: reqwest::Client, + client: HttpClient, retry_config: RetryConfig, client_options: ClientOptions, } @@ -94,26 +94,26 @@ pub(crate) struct Client { impl Client { pub(crate) fn new( url: Url, + client: HttpClient, client_options: ClientOptions, retry_config: RetryConfig, - ) -> Result { - let client = client_options.client()?; - Ok(Self { + ) -> Self { + Self { url, retry_config, client_options, client, - }) + } } pub(crate) fn base_url(&self) -> &Url { &self.url } - fn path_url(&self, location: &Path) -> Url { + fn path_url(&self, location: &Path) -> String { let mut url = self.url.clone(); url.path_segments_mut().unwrap().extend(location.parts()); - url + url.to_string() } /// Create a directory with `path` using MKCOL @@ -125,7 +125,7 @@ impl Client { .extend(path.split(DELIMITER)); self.client - .request(method, url) + .request(method, String::from(url)) .send_retry(&self.retry_config) .await .map_err(|source| Error::Request { source })?; @@ -167,7 +167,7 @@ impl Client { location: &Path, payload: PutPayload, attributes: Attributes, - ) -> Result { + ) -> Result { let mut retry = false; loop { let url = self.path_url(location); @@ -222,7 +222,7 @@ impl Client { pub(crate) async fn list(&self, location: Option<&Path>, depth: &str) -> Result { let url = location .map(|path| self.path_url(path)) - .unwrap_or_else(|| self.url.clone()); + .unwrap_or_else(|| self.url.to_string()); let method = Method::from_bytes(b"PROPFIND").unwrap(); let result = self @@ -236,6 +236,7 @@ impl 
Client { let response = match result { Ok(result) => result + .into_body() .bytes() .await .map_err(|source| Error::Reqwest { source })?, @@ -332,7 +333,7 @@ impl GetClient for Client { user_defined_metadata_prefix: None, }; - async fn get_request(&self, path: &Path, options: GetOptions) -> Result { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let url = self.path_url(path); let method = match options.head { true => Method::HEAD, diff --git a/src/http/mod.rs b/src/http/mod.rs index 899740d..8fba4d7 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -41,6 +41,7 @@ use url::Url; use crate::client::get::GetClientExt; use crate::client::header::get_etag; +use crate::client::{HttpConnector, ReqwestConnector}; use crate::http::client::Client; use crate::path::Path; use crate::{ @@ -203,6 +204,7 @@ pub struct HttpBuilder { url: Option, client_options: ClientOptions, retry_config: RetryConfig, + http_connector: Option>, } impl HttpBuilder { @@ -235,13 +237,29 @@ impl HttpBuilder { self } + /// Overrides the [`HttpConnector`], by default uses [`ReqwestConnector`] + pub fn with_http_connector(mut self, connector: C) -> Self { + self.http_connector = Some(Arc::new(connector)); + self + } + /// Build an [`HttpStore`] with the configured options pub fn build(self) -> Result { let url = self.url.ok_or(Error::MissingUrl)?; let parsed = Url::parse(&url).map_err(|source| Error::UnableToParseUrl { url, source })?; + let client = match self.http_connector { + None => ReqwestConnector::default().connect(&self.client_options)?, + Some(x) => x.connect(&self.client_options)?, + }; + Ok(HttpStore { - client: Arc::new(Client::new(parsed, self.client_options, self.retry_config)?), + client: Arc::new(Client::new( + parsed, + client, + self.client_options, + self.retry_config, + )), }) } } diff --git a/src/lib.rs b/src/lib.rs index cffcbbd..58f757b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -526,7 +526,7 @@ pub mod signer; pub mod throttle; #[cfg(feature = "cloud")] -mod client; +pub mod client; #[cfg(feature = "cloud")] pub use client::{ @@ -1411,7 +1411,7 @@ mod tests { pub(crate) async fn tagging(storage: Arc, validate: bool, get_tags: F) where F: Fn(Path) -> Fut + Send + Sync, - Fut: std::future::Future> + Send, + Fut: std::future::Future> + Send, { use bytes::Buf; use serde::Deserialize; @@ -1477,7 +1477,7 @@ mod tests { for path in [path, multi_path, buf_path] { let resp = get_tags(path.clone()).await.unwrap(); - let body = resp.bytes().await.unwrap(); + let body = resp.into_body().bytes().await.unwrap(); let mut resp: Tagging = quick_xml::de::from_reader(body.reader()).unwrap(); resp.list.tags.sort_by(|a, b| a.key.cmp(&b.key)); diff --git a/src/parse.rs b/src/parse.rs index bc65a0b..4e67e59 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -345,7 +345,7 @@ mod tests { #[cfg(feature = "http")] async fn test_url_http() { use crate::client::mock_server::MockServer; - use hyper::{header::USER_AGENT, Response}; + use http::{header::USER_AGENT, Response}; let server = MockServer::new().await; diff --git a/src/payload.rs b/src/payload.rs index d71f016..055336b 100644 --- a/src/payload.rs +++ b/src/payload.rs @@ -44,13 +44,6 @@ impl PutPayload { s.into() } - #[cfg(feature = "cloud")] - pub(crate) fn body(&self) -> reqwest::Body { - reqwest::Body::wrap_stream(futures::stream::iter( - self.clone().into_iter().map(Ok::<_, crate::Error>), - )) - } - /// Returns the total length of the [`Bytes`] in this payload pub fn content_length(&self) -> usize { self.0.iter().map(|b| b.len()).sum() From 
19a2c7463144f2f597e8a86bc20d60aaa9fa2e6a Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 27 Feb 2025 15:02:23 +0100 Subject: [PATCH 389/397] feat: add `Extensions` to object store `GetOptions` (#7170) * feat: add `Extensions` to object store `GetOptions` Closes #7155. * refactor: replace own `Extensions` by `http` version * feat: wire `Extensions` into HTTP stack --- Cargo.toml | 4 ++-- src/client/builder.rs | 7 +++++++ src/client/mod.rs | 23 ++++++++++++++++++----- src/lib.rs | 5 +++++ 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7e51245..0372514 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ async-trait = "0.1.53" bytes = "1.0" chrono = { version = "0.4.34", default-features = false, features = ["clock"] } futures = "0.3" +http = "1.2.0" humantime = "2.1" itertools = "0.14.0" parking_lot = { version = "0.12" } @@ -46,7 +47,6 @@ walkdir = { version = "2", optional = true } # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } form_urlencoded = { version = "1.2", optional = true } -http = { version = "1.2.0", optional = true } http-body-util = { version = "0.1", optional = true } httparse = { version = "1.8.0", default-features = false, features = ["std"], optional = true } hyper = { version = "1.2", default-features = false, optional = true } @@ -66,7 +66,7 @@ nix = { version = "0.29.0", features = ["fs"] } [features] default = ["fs"] -cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "dep:http", "http-body-util", "form_urlencoded", "serde_urlencoded"] +cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "http-body-util", "form_urlencoded", "serde_urlencoded"] azure = ["cloud", "httparse"] fs = ["walkdir"] gcp = ["cloud", "rustls-pemfile"] diff --git a/src/client/builder.rs b/src/client/builder.rs index 0fbc12f..fcbc6e8 100644 --- a/src/client/builder.rs +++ b/src/client/builder.rs @@ -92,6 +92,13 @@ impl HttpRequestBuilder { self } + pub(crate) fn extensions(mut self, extensions: ::http::Extensions) -> Self { + if let Ok(r) = &mut self.request { + *r.extensions_mut() = extensions; + } + self + } + pub(crate) fn header(mut self, name: K, value: V) -> Self where K: TryInto, diff --git a/src/client/mod.rs b/src/client/mod.rs index 4fe3cff..36252f5 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -718,27 +718,40 @@ impl GetOptionsExt for HttpRequestBuilder { fn with_get_options(mut self, options: GetOptions) -> Self { use hyper::header::*; - if let Some(range) = options.range { + let GetOptions { + if_match, + if_none_match, + if_modified_since, + if_unmodified_since, + range, + version: _, + head: _, + extensions, + } = options; + + if let Some(range) = range { self = self.header(RANGE, range.to_string()); } - if let Some(tag) = options.if_match { + if let Some(tag) = if_match { self = self.header(IF_MATCH, tag); } - if let Some(tag) = options.if_none_match { + if let Some(tag) = if_none_match { self = self.header(IF_NONE_MATCH, tag); } const DATE_FORMAT: &str = "%a, %d %b %Y %H:%M:%S GMT"; - if let Some(date) = options.if_unmodified_since { + if let Some(date) = if_unmodified_since { self = self.header(IF_UNMODIFIED_SINCE, date.format(DATE_FORMAT).to_string()); } - if let Some(date) = options.if_modified_since { + if let Some(date) = if_modified_since { self = self.header(IF_MODIFIED_SINCE, 
date.format(DATE_FORMAT).to_string()); } + self = self.extensions(extensions); + self } } diff --git a/src/lib.rs b/src/lib.rs index 58f757b..21352f5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -967,6 +967,11 @@ pub struct GetOptions { /// /// pub head: bool, + /// Implementation-specific extensions. Intended for use by [`ObjectStore`] implementations + /// that need to pass context-specific information (like tracing spans) via trait methods. + /// + /// These extensions are ignored entirely by backends offered through this crate. + pub extensions: ::http::Extensions, } impl GetOptions { From e43cc614925054027da55924ffe947e9d0d795f0 Mon Sep 17 00:00:00 2001 From: meteorgan Date: Fri, 28 Feb 2025 00:08:52 +0800 Subject: [PATCH 390/397] chore: enable conditional put by default for S3 (#7181) * fix cargo clippy * add Disabled variant to S3ConditionalPut * restore object_store.yml --- src/aws/builder.rs | 16 +++++++--------- src/aws/client.rs | 2 +- src/aws/mod.rs | 11 ++++++----- src/aws/precondition.rs | 8 +++++++- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index 5e3d32e..e49ba1d 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -160,7 +160,7 @@ pub struct AmazonS3Builder { /// Copy if not exists copy_if_not_exists: Option>, /// Put precondition - conditional_put: Option>, + conditional_put: ConfigValue, /// Ignore tags disable_tagging: ConfigValue, /// Encryption (See [`S3EncryptionConfigKey`]) @@ -525,7 +525,7 @@ impl AmazonS3Builder { self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) } AmazonS3ConfigKey::ConditionalPut => { - self.conditional_put = Some(ConfigValue::Deferred(value.into())) + self.conditional_put = ConfigValue::Deferred(value.into()) } AmazonS3ConfigKey::RequestPayer => { self.request_payer = ConfigValue::Deferred(value.into()) @@ -583,9 +583,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists.as_ref().map(ToString::to_string) } - AmazonS3ConfigKey::ConditionalPut => { - self.conditional_put.as_ref().map(ToString::to_string) - } + AmazonS3ConfigKey::ConditionalPut => Some(self.conditional_put.to_string()), AmazonS3ConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), AmazonS3ConfigKey::RequestPayer => Some(self.request_payer.to_string()), AmazonS3ConfigKey::Encryption(key) => match key { @@ -827,9 +825,10 @@ impl AmazonS3Builder { self } - /// Configure how to provide conditional put operations + /// Configure how to provide conditional put operations. 
+ /// if not set, the default value will be `S3ConditionalPut::ETagMatch` pub fn with_conditional_put(mut self, config: S3ConditionalPut) -> Self { - self.conditional_put = Some(config.into()); + self.conditional_put = config.into(); self } @@ -905,7 +904,6 @@ impl AmazonS3Builder { let region = self.region.unwrap_or_else(|| "us-east-1".to_string()); let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; - let put_precondition = self.conditional_put.map(|x| x.get()).transpose()?; let credentials = if let Some(credentials) = self.credentials { credentials @@ -1045,7 +1043,7 @@ impl AmazonS3Builder { disable_tagging: self.disable_tagging.get()?, checksum, copy_if_not_exists, - conditional_put: put_precondition, + conditional_put: self.conditional_put.get()?, encryption_headers, request_payer: self.request_payer.get()?, }; diff --git a/src/aws/client.rs b/src/aws/client.rs index 2cf808a..d8132d0 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -204,7 +204,7 @@ pub(crate) struct S3Config { pub disable_tagging: bool, pub checksum: Option, pub copy_if_not_exists: Option, - pub conditional_put: Option, + pub conditional_put: S3ConditionalPut, pub request_payer: bool, pub(super) encryption_headers: S3EncryptionHeaders, } diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 0625ae1..2c1852b 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -169,8 +169,8 @@ impl ObjectStore for AmazonS3 { match (opts.mode, &self.client.config.conditional_put) { (PutMode::Overwrite, _) => request.idempotent(true).do_put().await, - (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), - (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { + (PutMode::Create, S3ConditionalPut::Disabled) => Err(Error::NotImplemented), + (PutMode::Create, S3ConditionalPut::ETagMatch) => { match request.header(&IF_NONE_MATCH, "*").do_put().await { // Technically If-None-Match should return NotModified but some stores, // such as R2, instead return PreconditionFailed @@ -184,11 +184,11 @@ impl ObjectStore for AmazonS3 { r => r, } } - (PutMode::Create, Some(S3ConditionalPut::Dynamo(d))) => { + (PutMode::Create, S3ConditionalPut::Dynamo(d)) => { d.conditional_op(&self.client, location, None, move || request.do_put()) .await } - (PutMode::Update(v), Some(put)) => { + (PutMode::Update(v), put) => { let etag = v.e_tag.ok_or_else(|| Error::Generic { store: STORE, source: "ETag required for conditional put".to_string().into(), @@ -221,6 +221,7 @@ impl ObjectStore for AmazonS3 { }) .await } + S3ConditionalPut::Disabled => Err(Error::NotImplemented), } } } @@ -561,7 +562,7 @@ mod tests { let integration = config.build().unwrap(); let config = &integration.client.config; let test_not_exists = config.copy_if_not_exists.is_some(); - let test_conditional_put = config.conditional_put.is_some(); + let test_conditional_put = config.conditional_put != S3ConditionalPut::Disabled; put_get_delete_list(&integration).await; get_opts(&integration).await; diff --git a/src/aws/precondition.rs b/src/aws/precondition.rs index b261ad0..ab5aea9 100644 --- a/src/aws/precondition.rs +++ b/src/aws/precondition.rs @@ -126,7 +126,7 @@ impl Parse for S3CopyIfNotExists { /// Configure how to provide conditional put support for [`AmazonS3`]. 
/// /// [`AmazonS3`]: super::AmazonS3 -#[derive(Debug, Clone, Eq, PartialEq)] +#[derive(Debug, Clone, Eq, PartialEq, Default)] #[allow(missing_copy_implementations)] #[non_exhaustive] pub enum S3ConditionalPut { @@ -136,6 +136,7 @@ pub enum S3ConditionalPut { /// Encoded as `etag` ignoring whitespace /// /// [HTTP precondition]: https://datatracker.ietf.org/doc/html/rfc9110#name-preconditions + #[default] ETagMatch, /// The name of a DynamoDB table to use for coordination @@ -147,6 +148,9 @@ pub enum S3ConditionalPut { /// /// This will use the same region, credentials and endpoint as configured for S3 Dynamo(DynamoCommit), + + /// Disable `conditional put` + Disabled, } impl std::fmt::Display for S3ConditionalPut { @@ -154,6 +158,7 @@ impl std::fmt::Display for S3ConditionalPut { match self { Self::ETagMatch => write!(f, "etag"), Self::Dynamo(lock) => write!(f, "dynamo: {}", lock.table_name()), + Self::Disabled => write!(f, "disabled"), } } } @@ -162,6 +167,7 @@ impl S3ConditionalPut { fn from_str(s: &str) -> Option { match s.trim() { "etag" => Some(Self::ETagMatch), + "disabled" => Some(Self::Disabled), trimmed => match trimmed.split_once(':')? { ("dynamo", s) => Some(Self::Dynamo(DynamoCommit::from_str(s)?)), _ => None, From 1a0560896e443cc53a9d47ebe354878751b99175 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 27 Feb 2025 18:11:22 +0100 Subject: [PATCH 391/397] feat: add `Extensions` to object store `PutOptions` (#7213) Follow-up to #7170. --- src/aws/client.rs | 5 +++++ src/aws/mod.rs | 14 +++++++++++--- src/azure/client.rs | 19 ++++++++++++++++--- src/gcp/client.rs | 20 +++++++++++++++++--- src/lib.rs | 29 ++++++++++++++++++++++++++++- 5 files changed, 77 insertions(+), 10 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index d8132d0..6cf5540 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -389,6 +389,11 @@ impl Request<'_> { Self { builder, ..self } } + pub(crate) fn with_extensions(self, extensions: ::http::Extensions) -> Self { + let builder = self.builder.extensions(extensions); + Self { builder, ..self } + } + pub(crate) fn with_payload(mut self, payload: PutPayload) -> Self { if (!self.config.skip_signature && self.config.sign_payload) || self.config.checksum.is_some() diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 2c1852b..76e298f 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -159,15 +159,23 @@ impl ObjectStore for AmazonS3 { payload: PutPayload, opts: PutOptions, ) -> Result { + let PutOptions { + mode, + tags, + attributes, + extensions, + } = opts; + let request = self .client .request(Method::PUT, location) .with_payload(payload) - .with_attributes(opts.attributes) - .with_tags(opts.tags) + .with_attributes(attributes) + .with_tags(tags) + .with_extensions(extensions) .with_encryption_headers(); - match (opts.mode, &self.client.config.conditional_put) { + match (mode, &self.client.config.conditional_put) { (PutMode::Overwrite, _) => request.idempotent(true).do_put().await, (PutMode::Create, S3ConditionalPut::Disabled) => Err(Error::NotImplemented), (PutMode::Create, S3ConditionalPut::ETagMatch) => { diff --git a/src/azure/client.rs b/src/azure/client.rs index 13e40bb..c4d026b 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -257,6 +257,11 @@ impl PutRequest<'_> { Self { builder, ..self } } + fn with_extensions(self, extensions: ::http::Extensions) -> Self { + let builder = self.builder.extensions(extensions); + Self { builder, ..self } + } + async fn send(self) -> Result { let credential = 
self.config.get_credential().await?; let sensitive = credential @@ -540,12 +545,20 @@ impl AzureClient { payload: PutPayload, opts: PutOptions, ) -> Result { + let PutOptions { + mode, + tags, + attributes, + extensions, + } = opts; + let builder = self .put_request(path, payload) - .with_attributes(opts.attributes) - .with_tags(opts.tags); + .with_attributes(attributes) + .with_extensions(extensions) + .with_tags(tags); - let builder = match &opts.mode { + let builder = match &mode { PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&IF_NONE_MATCH, "*"), PutMode::Update(v) => { diff --git a/src/gcp/client.rs b/src/gcp/client.rs index a52ad36..e514624 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -231,6 +231,11 @@ impl Request<'_> { } } + fn with_extensions(self, extensions: ::http::Extensions) -> Self { + let builder = self.builder.extensions(extensions); + Self { builder, ..self } + } + async fn send(self) -> Result { let credential = self.config.credentials.get_credential().await?; let resp = self @@ -384,12 +389,21 @@ impl GoogleCloudStorageClient { payload: PutPayload, opts: PutOptions, ) -> Result { + let PutOptions { + mode, + // not supported by GCP + tags: _, + attributes, + extensions, + } = opts; + let builder = self .request(Method::PUT, path) .with_payload(payload) - .with_attributes(opts.attributes); + .with_attributes(attributes) + .with_extensions(extensions); - let builder = match &opts.mode { + let builder = match &mode { PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&VERSION_MATCH, "0"), PutMode::Update(v) => { @@ -398,7 +412,7 @@ impl GoogleCloudStorageClient { } }; - match (opts.mode, builder.do_put().await) { + match (mode, builder.do_put().await) { (PutMode::Create, Err(crate::Error::Precondition { path, source })) => { Err(crate::Error::AlreadyExists { path, source }) } diff --git a/src/lib.rs b/src/lib.rs index 21352f5..8f05fb3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1154,7 +1154,7 @@ impl From for UpdateVersion { } /// Options for a put request -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Clone, Default)] pub struct PutOptions { /// Configure the [`PutMode`] for this operation pub mode: PutMode, @@ -1166,8 +1166,35 @@ pub struct PutOptions { /// /// Implementations that don't support an attribute should return an error pub attributes: Attributes, + /// Implementation-specific extensions. Intended for use by [`ObjectStore`] implementations + /// that need to pass context-specific information (like tracing spans) via trait methods. + /// + /// These extensions are ignored entirely by backends offered through this crate. + /// + /// They are also eclused from [`PartialEq`] and [`Eq`]. 
+ pub extensions: ::http::Extensions, } +impl PartialEq for PutOptions { + fn eq(&self, other: &Self) -> bool { + let Self { + mode, + tags, + attributes, + extensions: _, + } = self; + let Self { + mode: other_mode, + tags: other_tags, + attributes: other_attributes, + extensions: _, + } = other; + (mode == other_mode) && (tags == other_tags) && (attributes == other_attributes) + } +} + +impl Eq for PutOptions {} + impl From for PutOptions { fn from(mode: PutMode) -> Self { Self { From bd0c489506769e68480625113a4e42905287bc4b Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 27 Feb 2025 18:47:09 +0100 Subject: [PATCH 392/397] feat: add `Extensions` to object store `PutMultipartOpts` (#7214) --- src/aws/client.rs | 11 +++++++++-- src/azure/client.rs | 11 +++++++++-- src/buffered.rs | 17 +++++++++++++++++ src/gcp/client.rs | 10 +++++++++- src/lib.rs | 27 ++++++++++++++++++++++++++- 5 files changed, 70 insertions(+), 6 deletions(-) diff --git a/src/aws/client.rs b/src/aws/client.rs index 6cf5540..fb2a033 100644 --- a/src/aws/client.rs +++ b/src/aws/client.rs @@ -633,6 +633,12 @@ impl S3Client { location: &Path, opts: PutMultipartOpts, ) -> Result { + let PutMultipartOpts { + tags, + attributes, + extensions, + } = opts; + let mut request = self.request(Method::POST, location); if let Some(algorithm) = self.config.checksum { match algorithm { @@ -644,8 +650,9 @@ impl S3Client { let response = request .query(&[("uploads", "")]) .with_encryption_headers() - .with_attributes(opts.attributes) - .with_tags(opts.tags) + .with_attributes(attributes) + .with_tags(tags) + .with_extensions(extensions) .idempotent(true) .send() .await? diff --git a/src/azure/client.rs b/src/azure/client.rs index c4d026b..dbeae63 100644 --- a/src/azure/client.rs +++ b/src/azure/client.rs @@ -599,6 +599,12 @@ impl AzureClient { parts: Vec, opts: PutMultipartOpts, ) -> Result { + let PutMultipartOpts { + tags, + attributes, + extensions, + } = opts; + let blocks = parts .into_iter() .map(|part| BlockId::from(part.content_id)) @@ -607,8 +613,9 @@ impl AzureClient { let payload = BlockList { blocks }.to_xml().into(); let response = self .put_request(path, payload) - .with_attributes(opts.attributes) - .with_tags(opts.tags) + .with_attributes(attributes) + .with_tags(tags) + .with_extensions(extensions) .query(&[("comp", "blocklist")]) .idempotent(true) .send() diff --git a/src/buffered.rs b/src/buffered.rs index fcd7e06..a767cb6 100644 --- a/src/buffered.rs +++ b/src/buffered.rs @@ -222,6 +222,7 @@ pub struct BufWriter { max_concurrency: usize, attributes: Option, tags: Option, + extensions: Option<::http::Extensions>, state: BufWriterState, store: Arc, } @@ -259,6 +260,7 @@ impl BufWriter { max_concurrency: 8, attributes: None, tags: None, + extensions: None, state: BufWriterState::Buffer(path, PutPayloadMut::new()), } } @@ -289,6 +291,19 @@ impl BufWriter { } } + /// Set the extensions of the uploaded object + /// + /// Implementation-specific extensions. Intended for use by [`ObjectStore`] implementations + /// that need to pass context-specific information (like tracing spans) via trait methods. + /// + /// These extensions are ignored entirely by backends offered through this crate. + pub fn with_extensions(self, extensions: ::http::Extensions) -> Self { + Self { + extensions: Some(extensions), + ..self + } + } + /// Write data to the writer in [`Bytes`]. /// /// Unlike [`AsyncWrite::poll_write`], `put` can write data without extra copying. 
@@ -325,6 +340,7 @@ impl BufWriter { let opts = PutMultipartOpts { attributes: self.attributes.take().unwrap_or_default(), tags: self.tags.take().unwrap_or_default(), + extensions: self.extensions.take().unwrap_or_default(), }; let upload = self.store.put_multipart_opts(&path, opts).await?; let mut chunked = @@ -384,6 +400,7 @@ impl AsyncWrite for BufWriter { let opts = PutMultipartOpts { attributes: self.attributes.take().unwrap_or_default(), tags: self.tags.take().unwrap_or_default(), + extensions: self.extensions.take().unwrap_or_default(), }; let store = Arc::clone(&self.store); self.state = BufWriterState::Prepare(Box::pin(async move { diff --git a/src/gcp/client.rs b/src/gcp/client.rs index e514624..1cc7296 100644 --- a/src/gcp/client.rs +++ b/src/gcp/client.rs @@ -453,9 +453,17 @@ impl GoogleCloudStorageClient { path: &Path, opts: PutMultipartOpts, ) -> Result { + let PutMultipartOpts { + // not supported by GCP + tags: _, + attributes, + extensions, + } = opts; + let response = self .request(Method::POST, path) - .with_attributes(opts.attributes) + .with_attributes(attributes) + .with_extensions(extensions) .header(&CONTENT_LENGTH, "0") .query(&[("uploads", "")]) .send() diff --git a/src/lib.rs b/src/lib.rs index 8f05fb3..5db7e01 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1223,7 +1223,7 @@ impl From for PutOptions { } /// Options for [`ObjectStore::put_multipart_opts`] -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Clone, Default)] pub struct PutMultipartOpts { /// Provide a [`TagSet`] for this object /// @@ -1233,8 +1233,33 @@ pub struct PutMultipartOpts { /// /// Implementations that don't support an attribute should return an error pub attributes: Attributes, + /// Implementation-specific extensions. Intended for use by [`ObjectStore`] implementations + /// that need to pass context-specific information (like tracing spans) via trait methods. + /// + /// These extensions are ignored entirely by backends offered through this crate. + /// + /// They are also eclused from [`PartialEq`] and [`Eq`]. 
+ pub extensions: ::http::Extensions, +} + +impl PartialEq for PutMultipartOpts { + fn eq(&self, other: &Self) -> bool { + let Self { + tags, + attributes, + extensions: _, + } = self; + let Self { + tags: other_tags, + attributes: other_attributes, + extensions: _, + } = other; + (tags == other_tags) && (attributes == other_attributes) + } } +impl Eq for PutMultipartOpts {} + impl From for PutMultipartOpts { fn from(tags: TagSet) -> Self { Self { From e975177814778b711e11b048bb5c703d2b893427 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Mar 2025 18:15:22 +0000 Subject: [PATCH 393/397] ObjectStore WASM32 Support (#7226) * ObjectStore WASM32 Support * Docs * Update .github/workflows/object_store.yml --- src/aws/builder.rs | 10 +++++----- src/aws/mod.rs | 4 ++++ src/azure/builder.rs | 10 +++++----- src/client/body.rs | 1 + src/client/connection.rs | 34 +++++++++++++++++++++++++++++++++- src/client/mod.rs | 24 +++++++++++++++++------- src/gcp/builder.rs | 10 +++++----- src/http/mod.rs | 11 +++++------ src/lib.rs | 17 +++++------------ src/parse.rs | 8 +++++++- 10 files changed, 87 insertions(+), 42 deletions(-) diff --git a/src/aws/builder.rs b/src/aws/builder.rs index e49ba1d..5dff94d 100644 --- a/src/aws/builder.rs +++ b/src/aws/builder.rs @@ -23,7 +23,7 @@ use crate::aws::{ AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3ConditionalPut, S3CopyIfNotExists, STORE, }; -use crate::client::{HttpConnector, ReqwestConnector, TokenCredentialProvider}; +use crate::client::{http_connector, HttpConnector, TokenCredentialProvider}; use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use base64::prelude::BASE64_STANDARD; @@ -883,7 +883,9 @@ impl AmazonS3Builder { self } - /// Overrides the [`HttpConnector`], by default uses [`ReqwestConnector`] + /// The [`HttpConnector`] to use + /// + /// On non-WASM32 platforms uses [`reqwest`] by default, on WASM32 platforms must be provided pub fn with_http_connector(mut self, connector: C) -> Self { self.http_connector = Some(Arc::new(connector)); self @@ -896,9 +898,7 @@ impl AmazonS3Builder { self.parse_url(&url)?; } - let http = self - .http_connector - .unwrap_or_else(|| Arc::new(ReqwestConnector::default())); + let http = http_connector(self.http_connector)?; let bucket = self.bucket_name.ok_or(Error::MissingBucketName)?; let region = self.region.unwrap_or_else(|| "us-east-1".to_string()); diff --git a/src/aws/mod.rs b/src/aws/mod.rs index 76e298f..b8175bd 100644 --- a/src/aws/mod.rs +++ b/src/aws/mod.rs @@ -58,12 +58,16 @@ mod client; mod credential; mod dynamo; mod precondition; + +#[cfg(not(target_arch = "wasm32"))] mod resolve; pub use builder::{AmazonS3Builder, AmazonS3ConfigKey}; pub use checksum::Checksum; pub use dynamo::DynamoCommit; pub use precondition::{S3ConditionalPut, S3CopyIfNotExists}; + +#[cfg(not(target_arch = "wasm32"))] pub use resolve::resolve_bucket_region; /// This struct is used to maintain the URI path encoding diff --git a/src/azure/builder.rs b/src/azure/builder.rs index ab0a484..f176fc6 100644 --- a/src/azure/builder.rs +++ b/src/azure/builder.rs @@ -21,7 +21,7 @@ use crate::azure::credential::{ ImdsManagedIdentityProvider, WorkloadIdentityOAuthProvider, }; use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; -use crate::client::{HttpConnector, ReqwestConnector, TokenCredentialProvider}; +use crate::client::{http_connector, HttpConnector, 
TokenCredentialProvider}; use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; @@ -889,7 +889,9 @@ impl MicrosoftAzureBuilder { self } - /// Overrides the [`HttpConnector`], by default uses [`ReqwestConnector`] + /// The [`HttpConnector`] to use + /// + /// On non-WASM32 platforms uses [`reqwest`] by default, on WASM32 platforms must be provided pub fn with_http_connector(mut self, connector: C) -> Self { self.http_connector = Some(Arc::new(connector)); self @@ -907,9 +909,7 @@ impl MicrosoftAzureBuilder { Arc::new(StaticCredentialProvider::new(credential)) }; - let http = self - .http_connector - .unwrap_or_else(|| Arc::new(ReqwestConnector::default())); + let http = http_connector(self.http_connector)?; let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? { let account_name = self diff --git a/src/client/body.rs b/src/client/body.rs index 549b3e4..8f62afa 100644 --- a/src/client/body.rs +++ b/src/client/body.rs @@ -39,6 +39,7 @@ impl HttpRequestBody { Self(Inner::Bytes(Bytes::new())) } + #[cfg(not(target_arch = "wasm32"))] pub(crate) fn into_reqwest(self) -> reqwest::Body { match self.0 { Inner::Bytes(b) => b.into(), diff --git a/src/client/connection.rs b/src/client/connection.rs index 8b63169..7e2daf4 100644 --- a/src/client/connection.rs +++ b/src/client/connection.rs @@ -84,9 +84,14 @@ impl HttpError { } pub(crate) fn reqwest(e: reqwest::Error) -> Self { + #[cfg(not(target_arch = "wasm32"))] + let is_connect = || e.is_connect(); + #[cfg(target_arch = "wasm32")] + let is_connect = || false; + let mut kind = if e.is_timeout() { HttpErrorKind::Timeout - } else if e.is_connect() { + } else if is_connect() { HttpErrorKind::Connect } else if e.is_decode() { HttpErrorKind::Decode @@ -200,6 +205,7 @@ impl HttpClient { } #[async_trait] +#[cfg(not(target_arch = "wasm32"))] impl HttpService for reqwest::Client { async fn call(&self, req: HttpRequest) -> Result { let (parts, body) = req.into_parts(); @@ -227,11 +233,37 @@ pub trait HttpConnector: std::fmt::Debug + Send + Sync + 'static { /// [`HttpConnector`] using [`reqwest::Client`] #[derive(Debug, Default)] #[allow(missing_copy_implementations)] +#[cfg(not(target_arch = "wasm32"))] pub struct ReqwestConnector {} +#[cfg(not(target_arch = "wasm32"))] impl HttpConnector for ReqwestConnector { fn connect(&self, options: &ClientOptions) -> crate::Result { let client = options.client()?; Ok(HttpClient::new(client)) } } + +#[cfg(target_arch = "wasm32")] +pub(crate) fn http_connector( + custom: Option>, +) -> crate::Result> { + match custom { + Some(x) => Ok(x), + None => Err(crate::Error::NotSupported { + source: "WASM32 architectures must provide an HTTPConnector" + .to_string() + .into(), + }), + } +} + +#[cfg(not(target_arch = "wasm32"))] +pub(crate) fn http_connector( + custom: Option>, +) -> crate::Result> { + match custom { + Some(x) => Ok(x), + None => Ok(Arc::new(ReqwestConnector {})), + } +} diff --git a/src/client/mod.rs b/src/client/mod.rs index 36252f5..bd0347b 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -19,6 +19,7 @@ pub(crate) mod backoff; +#[cfg(not(target_arch = "wasm32"))] mod dns; #[cfg(test)] @@ -48,22 +49,25 @@ pub use body::{HttpRequest, HttpRequestBody, HttpResponse, HttpResponseBody}; pub(crate) mod builder; mod connection; -pub use connection::{ - HttpClient, HttpConnector, HttpError, HttpErrorKind, HttpService, ReqwestConnector, -}; +pub(crate) use connection::http_connector; 
+#[cfg(not(target_arch = "wasm32"))] +pub use connection::ReqwestConnector; +pub use connection::{HttpClient, HttpConnector, HttpError, HttpErrorKind, HttpService}; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub(crate) mod parts; use async_trait::async_trait; use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, ClientBuilder, NoProxy, Proxy}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +#[cfg(not(target_arch = "wasm32"))] +use reqwest::{NoProxy, Proxy}; + use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; use crate::{GetOptions, Result}; @@ -195,8 +199,10 @@ impl FromStr for ClientConfigKey { /// This is used to configure the client to trust a specific certificate. See /// [Self::from_pem] for an example #[derive(Debug, Clone)] +#[cfg(not(target_arch = "wasm32"))] pub struct Certificate(reqwest::tls::Certificate); +#[cfg(not(target_arch = "wasm32"))] impl Certificate { /// Create a `Certificate` from a PEM encoded certificate. /// @@ -243,6 +249,7 @@ impl Certificate { #[derive(Debug, Clone)] pub struct ClientOptions { user_agent: Option>, + #[cfg(not(target_arch = "wasm32"))] root_certificates: Vec, content_type_map: HashMap, default_content_type: Option, @@ -276,6 +283,7 @@ impl Default for ClientOptions { // we opt for a slightly higher default timeout of 30 seconds Self { user_agent: None, + #[cfg(not(target_arch = "wasm32"))] root_certificates: Default::default(), content_type_map: Default::default(), default_content_type: None, @@ -402,6 +410,7 @@ impl ClientOptions { /// /// This can be used to connect to a server that has a self-signed /// certificate for example. + #[cfg(not(target_arch = "wasm32"))] pub fn with_root_certificate(mut self, certificate: Certificate) -> Self { self.root_certificates.push(certificate); self @@ -614,8 +623,9 @@ impl ClientOptions { .with_connect_timeout(Duration::from_secs(1)) } - pub(crate) fn client(&self) -> Result { - let mut builder = ClientBuilder::new(); + #[cfg(not(target_arch = "wasm32"))] + pub(crate) fn client(&self) -> Result { + let mut builder = reqwest::ClientBuilder::new(); match &self.user_agent { Some(user_agent) => builder = builder.user_agent(user_agent.get()?), @@ -799,7 +809,7 @@ mod cloud { use crate::client::token::{TemporaryToken, TokenCache}; use crate::RetryConfig; - /// A [`CredentialProvider`] that uses [`Client`] to fetch temporary tokens + /// A [`CredentialProvider`] that uses [`HttpClient`] to fetch temporary tokens #[derive(Debug)] pub(crate) struct TokenCredentialProvider { inner: T, diff --git a/src/gcp/builder.rs b/src/gcp/builder.rs index 7939783..74aecae 100644 --- a/src/gcp/builder.rs +++ b/src/gcp/builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::client::{HttpConnector, ReqwestConnector, TokenCredentialProvider}; +use crate::client::{http_connector, HttpConnector, TokenCredentialProvider}; use crate::gcp::client::{GoogleCloudStorageClient, GoogleCloudStorageConfig}; use crate::gcp::credential::{ ApplicationDefaultCredentials, InstanceCredentialProvider, ServiceAccountCredentials, @@ -427,7 +427,9 @@ impl GoogleCloudStorageBuilder { self } - /// Overrides the [`HttpConnector`], by default uses [`ReqwestConnector`] + /// The [`HttpConnector`] to use + /// + /// On non-WASM32 platforms uses [`reqwest`] by default, on WASM32 platforms must be provided pub fn with_http_connector(mut self, connector: C) -> Self { self.http_connector = Some(Arc::new(connector)); self @@ -442,9 +444,7 @@ impl GoogleCloudStorageBuilder { let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; - let http = self - .http_connector - .unwrap_or_else(|| Arc::new(ReqwestConnector::default())); + let http = http_connector(self.http_connector)?; // First try to initialize from the service account information. let service_account_credentials = diff --git a/src/http/mod.rs b/src/http/mod.rs index 8fba4d7..9786d83 100644 --- a/src/http/mod.rs +++ b/src/http/mod.rs @@ -41,7 +41,7 @@ use url::Url; use crate::client::get::GetClientExt; use crate::client::header::get_etag; -use crate::client::{HttpConnector, ReqwestConnector}; +use crate::client::{http_connector, HttpConnector}; use crate::http::client::Client; use crate::path::Path; use crate::{ @@ -237,7 +237,9 @@ impl HttpBuilder { self } - /// Overrides the [`HttpConnector`], by default uses [`ReqwestConnector`] + /// The [`HttpConnector`] to use + /// + /// On non-WASM32 platforms uses [`reqwest`] by default, on WASM32 platforms must be provided pub fn with_http_connector(mut self, connector: C) -> Self { self.http_connector = Some(Arc::new(connector)); self @@ -248,10 +250,7 @@ impl HttpBuilder { let url = self.url.ok_or(Error::MissingUrl)?; let parsed = Url::parse(&url).map_err(|source| Error::UnableToParseUrl { url, source })?; - let client = match self.http_connector { - None => ReqwestConnector::default().connect(&self.client_options)?, - Some(x) => x.connect(&self.client_options)?, - }; + let client = http_connector(self.http_connector)?.connect(&self.client_options)?; Ok(HttpStore { client: Arc::new(Client::new( diff --git a/src/lib.rs b/src/lib.rs index 5db7e01..836cd75 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -497,12 +497,6 @@ //! [`webpki-roots`]: https://crates.io/crates/webpki-roots //! 
-#[cfg(all( - target_arch = "wasm32", - any(feature = "gcp", feature = "aws", feature = "azure", feature = "http") -))] -compile_error!("Features 'gcp', 'aws', 'azure', 'http' are not supported on wasm."); - #[cfg(feature = "aws")] pub mod aws; #[cfg(feature = "azure")] @@ -530,10 +524,13 @@ pub mod client; #[cfg(feature = "cloud")] pub use client::{ - backoff::BackoffConfig, retry::RetryConfig, Certificate, ClientConfigKey, ClientOptions, - CredentialProvider, StaticCredentialProvider, + backoff::BackoffConfig, retry::RetryConfig, ClientConfigKey, ClientOptions, CredentialProvider, + StaticCredentialProvider, }; +#[cfg(all(feature = "cloud", not(target_arch = "wasm32")))] +pub use client::Certificate; + #[cfg(feature = "cloud")] mod config; @@ -1083,8 +1080,6 @@ impl GetResult { .await } GetResultPayload::Stream(s) => collect_bytes(s, Some(len)).await, - #[cfg(target_arch = "wasm32")] - _ => unimplemented!("File IO not implemented on wasm32."), } } @@ -1110,8 +1105,6 @@ impl GetResult { local::chunked_stream(file, path, self.range, CHUNK_SIZE) } GetResultPayload::Stream(s) => s, - #[cfg(target_arch = "wasm32")] - _ => unimplemented!("File IO not implemented on wasm32."), } } } diff --git a/src/parse.rs b/src/parse.rs index 4e67e59..00ea6cf 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -201,7 +201,13 @@ where let url = &url[..url::Position::BeforePath]; builder_opts!(crate::http::HttpBuilder, url, _options) } - #[cfg(not(all(feature = "aws", feature = "azure", feature = "gcp", feature = "http")))] + #[cfg(not(all( + feature = "aws", + feature = "azure", + feature = "gcp", + feature = "http", + not(target_arch = "wasm32") + )))] s => { return Err(super::Error::Generic { store: "parse_url", From e71f94eb1a7766125fb8c17b5fd9da1afd0e715a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 5 Mar 2025 10:39:59 +0000 Subject: [PATCH 394/397] Prepare object_store 0.12.0 (#7234) * Prepare object_store 0.12.0 * Changelog tweaks --- CHANGELOG-old.md | 36 ++++++++++++++++++ CHANGELOG.md | 63 +++++++++++++++++++------------- Cargo.toml | 2 +- dev/release/update_change_log.sh | 4 +- 4 files changed, 76 insertions(+), 29 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index c426892..f157e6f 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,42 @@ # Historical Changelog +## [object_store_0.11.2](https://github.com/apache/arrow-rs/tree/object_store_0.11.2) (2024-12-20) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.1...object_store_0.11.2) + +**Implemented enhancements:** + +- object-store's AzureClient should protect against multiple streams performing put\_block in parallel for the same BLOB path [\#6868](https://github.com/apache/arrow-rs/issues/6868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support S3 Put IfMatch [\#6799](https://github.com/apache/arrow-rs/issues/6799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store Azure Government using OAuth [\#6759](https://github.com/apache/arrow-rs/issues/6759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support for AWS Requester Pays buckets [\#6716](https://github.com/apache/arrow-rs/issues/6716) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object-store\]: Implement credential\_process support for S3 [\#6422](https://github.com/apache/arrow-rs/issues/6422) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Conditional put and rename\_if\_not\_exist on S3 [\#6285](https://github.com/apache/arrow-rs/issues/6285) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- `object_store` errors when `reqwest` `gzip` feature is enabled [\#6842](https://github.com/apache/arrow-rs/issues/6842) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Multi-part s3 uploads fail when using checksum [\#6793](https://github.com/apache/arrow-rs/issues/6793) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `with_unsigned_payload` shouldn't generate payload hash [\#6697](https://github.com/apache/arrow-rs/issues/6697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[Object\_store\] min\_ttl is too high for GKE tokens [\#6625](https://github.com/apache/arrow-rs/issues/6625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store `test_private_bucket` fails - store: "S3", source: BucketNotFound { bucket: "bloxbender" } [\#6600](https://github.com/apache/arrow-rs/issues/6600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- S3 endpoint and trailing slash result in weird/invalid requests [\#6580](https://github.com/apache/arrow-rs/issues/6580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Use randomized content ID for Azure multipart uploads [\#6869](https://github.com/apache/arrow-rs/pull/6869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avarnon](https://github.com/avarnon)) +- Always explicitly disable `gzip` automatic decompression on reqwest client used by object\_store [\#6843](https://github.com/apache/arrow-rs/pull/6843) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([phillipleblanc](https://github.com/phillipleblanc)) +- object-store: remove S3ConditionalPut::ETagPutIfNotExists [\#6802](https://github.com/apache/arrow-rs/pull/6802) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- Fix multipart uploads with checksums on object locked buckets [\#6794](https://github.com/apache/arrow-rs/pull/6794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Add AuthorityHost to AzureConfigKey [\#6773](https://github.com/apache/arrow-rs/pull/6773) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([zadeluca](https://github.com/zadeluca)) +- object\_store: Add support for requester pays buckets [\#6768](https://github.com/apache/arrow-rs/pull/6768) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron)) +- check sign\_payload instead of skip\_signature before computing checksum [\#6698](https://github.com/apache/arrow-rs/pull/6698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mherrerarendon](https://github.com/mherrerarendon)) +- Update quick-xml requirement from 0.36.0 to 0.37.0 in /object\_store [\#6687](https://github.com/apache/arrow-rs/pull/6687) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Support native S3 conditional writes [\#6682](https://github.com/apache/arrow-rs/pull/6682) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- \[object\_store\] fix S3 endpoint and trailing slash result in invalid requests [\#6641](https://github.com/apache/arrow-rs/pull/6641) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adbmal](https://github.com/adbmal)) +- Lower GCP token min\_ttl to 4 minutes and add backoff to token refresh logic [\#6638](https://github.com/apache/arrow-rs/pull/6638) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mwylde](https://github.com/mwylde)) +- Remove `test_private_bucket` object\_store test [\#6601](https://github.com/apache/arrow-rs/pull/6601) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) ## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e834c5..6dd9d0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,42 +19,53 @@ # Changelog -## [object_store_0.11.2](https://github.com/apache/arrow-rs/tree/object_store_0.11.2) (2024-12-20) +## [object_store_0.12.0](https://github.com/apache/arrow-rs/tree/object_store_0.12.0) (2025-03-05) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.1...object_store_0.11.2) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.2...object_store_0.12.0) + +**Breaking changes:** + +- feat: add `Extensions` to object store `PutMultipartOpts` [\#7214](https://github.com/apache/arrow-rs/pull/7214) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- feat: add `Extensions` to object store `PutOptions` [\#7213](https://github.com/apache/arrow-rs/pull/7213) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- chore: enable conditional put by default for S3 [\#7181](https://github.com/apache/arrow-rs/pull/7181) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([meteorgan](https://github.com/meteorgan)) +- feat: add `Extensions` to object store `GetOptions` [\#7170](https://github.com/apache/arrow-rs/pull/7170) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- feat\(object\_store\): Override DNS Resolution to Randomize IP Selection [\#7123](https://github.com/apache/arrow-rs/pull/7123) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Use `u64` range instead of `usize`, for better wasm32 support [\#6961](https://github.com/apache/arrow-rs/pull/6961) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([XiangpengHao](https://github.com/XiangpengHao)) +- object\_store: Add enabled-by-default "fs" feature [\#6636](https://github.com/apache/arrow-rs/pull/6636) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- Return `BoxStream` with `'static` lifetime from `ObjectStore::list` [\#6619](https://github.com/apache/arrow-rs/pull/6619) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron)) +- object\_store: Migrate from snafu to thiserror [\#6266](https://github.com/apache/arrow-rs/pull/6266) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([Turbo87](https://github.com/Turbo87)) **Implemented enhancements:** -- object-store's AzureClient should protect against multiple streams performing put\_block in parallel for the same BLOB path [\#6868](https://github.com/apache/arrow-rs/issues/6868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support S3 Put IfMatch [\#6799](https://github.com/apache/arrow-rs/issues/6799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store Azure Government using OAuth [\#6759](https://github.com/apache/arrow-rs/issues/6759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support for AWS Requester Pays buckets [\#6716](https://github.com/apache/arrow-rs/issues/6716) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[object-store\]: Implement credential\_process support for S3 [\#6422](https://github.com/apache/arrow-rs/issues/6422) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Conditional put and rename\_if\_not\_exist on S3 [\#6285](https://github.com/apache/arrow-rs/issues/6285) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Object Store: S3 IP address selection is biased [\#7117](https://github.com/apache/arrow-rs/issues/7117) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: GCSObjectStore should derive Clone [\#7113](https://github.com/apache/arrow-rs/issues/7113) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Remove all RCs after release [\#7059](https://github.com/apache/arrow-rs/issues/7059) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem::list\_with\_offset is very slow over network file system [\#7018](https://github.com/apache/arrow-rs/issues/7018) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Release object store `0.11.2` \(non API breaking\) Around Dec 15 2024 [\#6902](https://github.com/apache/arrow-rs/issues/6902) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- `object_store` errors when `reqwest` `gzip` feature is enabled [\#6842](https://github.com/apache/arrow-rs/issues/6842) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Multi-part s3 uploads fail when using checksum [\#6793](https://github.com/apache/arrow-rs/issues/6793) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- `with_unsigned_payload` shouldn't generate payload hash [\#6697](https://github.com/apache/arrow-rs/issues/6697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[Object\_store\] min\_ttl is too high for GKE tokens [\#6625](https://github.com/apache/arrow-rs/issues/6625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store `test_private_bucket` fails - store: "S3", source: BucketNotFound { bucket: "bloxbender" } [\#6600](https://github.com/apache/arrow-rs/issues/6600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- S3 endpoint and trailing slash result in weird/invalid requests [\#6580](https://github.com/apache/arrow-rs/issues/6580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem errors with satisfiable range request [\#6749](https://github.com/apache/arrow-rs/issues/6749) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
**Merged pull requests:**
-- Use randomized content ID for Azure multipart uploads [\#6869](https://github.com/apache/arrow-rs/pull/6869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avarnon](https://github.com/avarnon))
-- Always explicitly disable `gzip` automatic decompression on reqwest client used by object\_store [\#6843](https://github.com/apache/arrow-rs/pull/6843) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([phillipleblanc](https://github.com/phillipleblanc))
-- object-store: remove S3ConditionalPut::ETagPutIfNotExists [\#6802](https://github.com/apache/arrow-rs/pull/6802) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch))
-- Fix multipart uploads with checksums on object locked buckets [\#6794](https://github.com/apache/arrow-rs/pull/6794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio))
-- Add AuthorityHost to AzureConfigKey [\#6773](https://github.com/apache/arrow-rs/pull/6773) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([zadeluca](https://github.com/zadeluca))
-- object\_store: Add support for requester pays buckets [\#6768](https://github.com/apache/arrow-rs/pull/6768) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron))
-- check sign\_payload instead of skip\_signature before computing checksum [\#6698](https://github.com/apache/arrow-rs/pull/6698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mherrerarendon](https://github.com/mherrerarendon))
-- Update quick-xml requirement from 0.36.0 to 0.37.0 in /object\_store [\#6687](https://github.com/apache/arrow-rs/pull/6687) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum))
-- Support native S3 conditional writes [\#6682](https://github.com/apache/arrow-rs/pull/6682) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch))
-- \[object\_store\] fix S3 endpoint and trailing slash result in invalid requests [\#6641](https://github.com/apache/arrow-rs/pull/6641) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adbmal](https://github.com/adbmal))
-- Lower GCP token min\_ttl to 4 minutes and add backoff to token refresh logic [\#6638](https://github.com/apache/arrow-rs/pull/6638) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mwylde](https://github.com/mwylde))
-- Remove `test_private_bucket` object\_store test [\#6601](https://github.com/apache/arrow-rs/pull/6601) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb))
+- ObjectStore WASM32 Support [\#7226](https://github.com/apache/arrow-rs/pull/7226) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold))
+- \[main\] Bump arrow version to 54.2.1 \(\#7207\) [\#7212](https://github.com/apache/arrow-rs/pull/7212) ([alamb](https://github.com/alamb))
+- Decouple ObjectStore from Reqwest [\#7183](https://github.com/apache/arrow-rs/pull/7183) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold))
+- object\_store: Disable all compression formats in HTTP reqwest client [\#7143](https://github.com/apache/arrow-rs/pull/7143) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylewlacy](https://github.com/kylewlacy))
+- refactor: remove unused `async` from `InMemory::entry` [\#7133](https://github.com/apache/arrow-rs/pull/7133) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum))
+- object\_store/gcp: derive Clone for GoogleCloudStorage [\#7112](https://github.com/apache/arrow-rs/pull/7112) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([james-rms](https://github.com/james-rms))
+- Update version to 54.2.0 and add CHANGELOG [\#7110](https://github.com/apache/arrow-rs/pull/7110) ([alamb](https://github.com/alamb))
+- Remove all RCs after release [\#7060](https://github.com/apache/arrow-rs/pull/7060) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kou](https://github.com/kou))
+- Update release schedule README.md [\#7053](https://github.com/apache/arrow-rs/pull/7053) ([alamb](https://github.com/alamb))
+- Create GitHub releases automatically on tagging [\#7042](https://github.com/apache/arrow-rs/pull/7042) ([kou](https://github.com/kou))
+- Change Log On Succesful S3 Copy / Multipart Upload to Debug [\#7033](https://github.com/apache/arrow-rs/pull/7033) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([diptanu](https://github.com/diptanu))
+- Prepare for `54.1.0` release [\#7031](https://github.com/apache/arrow-rs/pull/7031) ([alamb](https://github.com/alamb))
+- Add a custom implementation `LocalFileSystem::list_with_offset` [\#7019](https://github.com/apache/arrow-rs/pull/7019) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([corwinjoy](https://github.com/corwinjoy))
+- Improve docs for `AmazonS3Builder::from_env` [\#6977](https://github.com/apache/arrow-rs/pull/6977) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron))
+- Fix WASM CI for Rust 1.84 release [\#6963](https://github.com/apache/arrow-rs/pull/6963) ([alamb](https://github.com/alamb))
+- Update itertools requirement from 0.13.0 to 0.14.0 in /object\_store [\#6925](https://github.com/apache/arrow-rs/pull/6925) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot))
+- Fix LocalFileSystem with range request that ends beyond end of file [\#6751](https://github.com/apache/arrow-rs/pull/6751) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron))
diff --git a/Cargo.toml b/Cargo.toml
index 0372514..0b40862 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,7 +17,7 @@
[package]
name = "object_store"
-version = "0.11.2"
+version = "0.12.0"
edition = "2021"
license = "MIT/Apache-2.0"
readme = "README.md"
diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh
index 2797b62..f52c9f4 100755
--- a/dev/release/update_change_log.sh
+++ b/dev/release/update_change_log.sh
@@ -29,8 +29,8 @@
set -e
-SINCE_TAG="object_store_0.11.1"
-FUTURE_RELEASE="object_store_0.11.2"
+SINCE_TAG="object_store_0.11.2"
+FUTURE_RELEASE="object_store_0.12.0"
SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)"
From 7525a3f088847a22418b71fb99f957efa4a0cac1 Mon Sep 17 00:00:00 2001
From: Matthijs Brobbel
Date: Sat, 8 Mar 2025 10:25:56 +0100
Subject: [PATCH 395/397] Use `doc_auto_cfg`, logo and favicon for docs.rs (#7145)
* Use `doc_auto_cfg` for docs.rs and add logo and icon
* Rustfmt
---------
Co-authored-by: Andrew Lamb
---
 src/lib.rs | 1 +
 1 file changed, 1 insertion(+)
diff --git a/src/lib.rs b/src/lib.rs
index 836cd75..ec660df 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)]
#![warn(
missing_copy_implementations,
From 6da5ac325f147482fb70d6b894a79738cb5d4e15 Mon Sep 17 00:00:00 2001
From: Arnaud Gourlay
Date: Tue, 11 Mar 2025 15:13:57 +0100
Subject: [PATCH 396/397] Fix requirement of object store for http-body-util (#7265)
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Cargo.toml b/Cargo.toml
index 0b40862..8370cd5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,7 +47,7 @@
walkdir = { version = "2", optional = true }
# Cloud storage support
base64 = { version = "0.22", default-features = false, features = ["std"], optional = true }
form_urlencoded = { version = "1.2", optional = true }
-http-body-util = { version = "0.1", optional = true }
+http-body-util = { version = "0.1.2", optional = true }
httparse = { version = "1.8.0", default-features = false, features = ["std"], optional = true }
hyper = { version = "1.2", default-features = false, optional = true }
md-5 = { version = "0.10.6", default-features = false, optional = true }
From 6a36dcb9bf2f5f4c44d48ea8e28fcb7f4d6db75f Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 20 Mar 2025 14:47:01 -0400
Subject: [PATCH 397/397] add .gitignore
---
 .gitignore | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 .gitignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0788dae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,99 @@
+Cargo.lock
+target
+rusty-tags.vi
+.history
+.flatbuffers/
+.idea/
+.vscode
+.devcontainer
+venv/*
+# created by doctests
+parquet/data.parquet
+# release notes cache
+.githubchangeloggenerator.cache
+.githubchangeloggenerator.cache.log
+justfile
+.prettierignore
+.env
+.editorconfig
+# local azurite file
+__azurite*
+__blobstorage__
+
+# .bak files
+*.bak
+*.bak2
+# OS-specific .gitignores
+
+# Mac .gitignore
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# Linux .gitignore
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+# Windows .gitignore
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+# Python virtual env in parquet crate
+parquet/pytest/venv/
+__pycache__/
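
Background on the `doc_auto_cfg` line added in [PATCH 395/397]: when rustdoc is invoked with the `docsrs` cfg (docs.rs builds typically enable this via `rustdoc-args = ["--cfg", "docsrs"]` in the crate's docs.rs metadata), `doc_auto_cfg` derives the "Available on crate feature ... only" banners directly from existing `#[cfg(...)]` attributes, so no per-item `#[doc(cfg(...))]` annotations are needed. A minimal sketch of the kind of item that benefits; the module and feature names below are illustrative and not taken from this patch:

// Hypothetical feature-gated module in a crate root that carries
// #![cfg_attr(docsrs, feature(doc_auto_cfg))].
// On a docs.rs build, rustdoc automatically labels this module
// "Available on crate feature `aws` only", with no manual
// #[doc(cfg(feature = "aws"))] attribute required.
#[cfg(feature = "aws")]
pub mod aws;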