From ae768c0c4c36e8bf0a7a7785e362333ce94e0381 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Sat, 19 Aug 2023 02:08:22 -0300 Subject: [PATCH] Fix FSS.S3 support when bucket is not provided This makes Explorer treat the "endpoint" as already containing the bucket name, and force the virtual host style in order to make Polars ignore the bucket name. Since FSS is always building the correct endpoint in case the user does not provide, we changed the endpoint to be required in the Rust struct. This work is related to https://github.com/elixir-explorer/fss/pull/2 --- lib/explorer/fss/s3.ex | 12 ++++++++---- lib/explorer/polars_backend/shared.ex | 4 +++- mix.exs | 2 +- mix.lock | 2 +- native/explorer/src/dataframe/io.rs | 27 ++++++++++++++++++++------- native/explorer/src/datatypes.rs | 27 +++++++++++++++++++++------ test/explorer/data_frame/csv_test.exs | 21 +++++++++++++++++++++ 7 files changed, 75 insertions(+), 20 deletions(-) diff --git a/lib/explorer/fss/s3.ex b/lib/explorer/fss/s3.ex index a69af33cd..8a750ca19 100644 --- a/lib/explorer/fss/s3.ex +++ b/lib/explorer/fss/s3.ex @@ -16,15 +16,19 @@ defimpl Explorer.FSS, for: FSS.S3.Entry do end defp url(%FSS.S3.Entry{} = entry) do + config = entry.config + + uri = URI.parse(config.endpoint) + uri = - if endpoint = entry.config.endpoint do - URI.parse(endpoint) + if not is_nil(config.bucket) and not String.contains?(config.endpoint, config.bucket) do + append_path(uri, "/" <> config.bucket) else - URI.parse("https://s3.#{entry.config.region}.amazonaws.com") + uri end uri - |> append_path("/" <> entry.bucket <> "/" <> entry.key) + |> append_path("/" <> entry.key) |> URI.to_string() end diff --git a/lib/explorer/polars_backend/shared.ex b/lib/explorer/polars_backend/shared.ex index 8df5e5e94..898b47d57 100644 --- a/lib/explorer/polars_backend/shared.ex +++ b/lib/explorer/polars_backend/shared.ex @@ -185,8 +185,10 @@ defmodule Explorer.PolarsBackend.Shared do the `System.tmp_dir()`. """ def build_path_for_entry(%FSS.S3.Entry{} = entry) do + bucket = entry.config.bucket || "default-explorer-bucket" + hash = - :crypto.hash(:sha256, entry.bucket <> "/" <> entry.key) |> Base.url_encode64(padding: false) + :crypto.hash(:sha256, bucket <> "/" <> entry.key) |> Base.url_encode64(padding: false) id = "s3-file-#{hash}" diff --git a/mix.exs b/mix.exs index 11e497b03..f64aa0e71 100644 --- a/mix.exs +++ b/mix.exs @@ -38,7 +38,7 @@ defmodule Explorer.MixProject do [ {:aws_signature, "~> 0.3"}, {:castore, "~> 1.0"}, - {:fss, github: "elixir-explorer/fss"}, + {:fss, github: "elixir-explorer/fss", branch: "ps-add-parse-bucket-url"}, {:rustler_precompiled, "~> 0.6"}, {:table, "~> 0.1.2"}, {:adbc, "~> 0.1", optional: true}, diff --git a/mix.lock b/mix.lock index 4080db7ba..93b0c240a 100644 --- a/mix.lock +++ b/mix.lock @@ -14,7 +14,7 @@ "earmark_parser": {:hex, :earmark_parser, "1.4.33", "3c3fd9673bb5dcc9edc28dd90f50c87ce506d1f71b70e3de69aa8154bc695d44", [:mix], [], "hexpm", "2d526833729b59b9fdb85785078697c72ac5e5066350663e5be6a1182da61b8f"}, "elixir_make": {:hex, :elixir_make, "0.7.7", "7128c60c2476019ed978210c245badf08b03dbec4f24d05790ef791da11aa17c", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}], "hexpm", "5bc19fff950fad52bbe5f211b12db9ec82c6b34a9647da0c2224b8b8464c7e6c"}, "ex_doc": {:hex, :ex_doc, "0.30.3", "bfca4d340e3b95f2eb26e72e4890da83e2b3a5c5b0e52607333bf5017284b063", [:mix], [{:earmark_parser, "~> 1.4.31", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "fbc8702046c1d25edf79de376297e608ac78cdc3a29f075484773ad1718918b6"}, - "fss": {:git, "https://github.com/elixir-explorer/fss.git", "417a1b6716c89d6a59cae01340ed0aff7e5b1164", []}, + "fss": {:git, "https://github.com/elixir-explorer/fss.git", "9154e1d66a34057d228e88bd4483f4e527ce4919", [branch: "ps-add-parse-bucket-url"]}, "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"}, "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"}, "makeup_elixir": {:hex, :makeup_elixir, "0.16.1", "cc9e3ca312f1cfeccc572b37a09980287e243648108384b97ff2b76e505c3555", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "e127a341ad1b209bd80f7bd1620a15693a9908ed780c3b763bccf7d200c767c6"}, diff --git a/native/explorer/src/dataframe/io.rs b/native/explorer/src/dataframe/io.rs index 389648297..4548cb6a1 100644 --- a/native/explorer/src/dataframe/io.rs +++ b/native/explorer/src/dataframe/io.rs @@ -262,13 +262,26 @@ fn build_aws_s3_cloud_writer( ) -> Result { let config = ex_entry.config; let mut aws_builder = object_store::aws::AmazonS3Builder::new() - .with_region(config.region) - .with_bucket_name(ex_entry.bucket) - .with_access_key_id(config.access_key_id) - .with_secret_access_key(config.secret_access_key); - - if let Some(endpoint) = config.endpoint { - aws_builder = aws_builder.with_allow_http(true).with_endpoint(endpoint); + .with_region(&config.region) + .with_access_key_id(&config.access_key_id) + .with_secret_access_key(&config.secret_access_key) + .with_allow_http(true) + .with_endpoint(&config.endpoint); + + if let Some(bucket_name) = &config.bucket { + aws_builder = aws_builder.with_bucket_name(bucket_name); + + // If the bucket is already in the endpoint, we force the "virtual-hosted" + // style in order to make Polars ignore the bucket name. + if config.endpoint.contains(bucket_name) { + aws_builder = aws_builder.with_virtual_hosted_style_request(true); + } + } else { + // This bucket name is going to be ignored because it's assumed to be + // already in the endpoint URL. + aws_builder = aws_builder + .with_bucket_name("explorer-default-bucket-name") + .with_virtual_hosted_style_request(true); } if let Some(token) = config.token { diff --git a/native/explorer/src/datatypes.rs b/native/explorer/src/datatypes.rs index f47a9f367..fea358790 100644 --- a/native/explorer/src/datatypes.rs +++ b/native/explorer/src/datatypes.rs @@ -421,21 +421,25 @@ pub struct ExS3Config { pub access_key_id: String, pub secret_access_key: String, pub region: String, - pub endpoint: Option, + pub endpoint: String, + pub bucket: Option, pub token: Option, } #[derive(NifStruct, Clone, Debug)] #[module = "FSS.S3.Entry"] pub struct ExS3Entry { - pub bucket: String, pub key: String, pub config: ExS3Config, } impl fmt::Display for ExS3Entry { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "s3://{}/{}", self.bucket, self.key) + if let Some(bucket_name) = &self.config.bucket { + write!(f, "s3://{}/{}", bucket_name, self.key) + } else { + write!(f, "s3://default-explorer-bucket/{}", self.key) + } } } @@ -450,13 +454,24 @@ impl ExS3Config { (S3Key::from_str("aws_allow_http").unwrap(), &true_as_string), ]; - if let Some(endpoint) = &self.endpoint { - aws_opts.push((S3Key::Endpoint, endpoint)) - } + aws_opts.push((S3Key::Endpoint, &self.endpoint)); if let Some(token) = &self.token { aws_opts.push((S3Key::Token, token)) } + + // When bucket is not present, we need to force the virtual host style + // in order to ignore the bucket name. + // We also enforce the host style because we already put the bucket when + // is an AWS S3 endpoint. + if self.bucket.is_none() || self.is_aws_endpoint() { + aws_opts.push((S3Key::VirtualHostedStyleRequest, &true_as_string)); + } + CloudOptions::default().with_aws(aws_opts) } + + pub fn is_aws_endpoint(&self) -> bool { + self.endpoint.contains("amazonaws.com") + } } diff --git a/test/explorer/data_frame/csv_test.exs b/test/explorer/data_frame/csv_test.exs index a7c7dea8e..2f34770cd 100644 --- a/test/explorer/data_frame/csv_test.exs +++ b/test/explorer/data_frame/csv_test.exs @@ -530,6 +530,27 @@ defmodule Explorer.DataFrame.CSVTest do assert {:error, "no such file or directory"} = DF.from_csv(path, config: s3_config) end + + @tag :cloud_integration + test "writes a CSV file to endpoint ignoring bucket name", %{df: df} do + config = %FSS.S3.Config{ + access_key_id: "test", + secret_access_key: "test", + endpoint: "http://localhost:4566/test-bucket", + bucket: nil, + region: "us-east-1" + } + + entry = %FSS.S3.Entry{ + key: "wine-yolo-#{System.monotonic_time()}.csv", + config: config + } + + assert :ok = DF.to_csv(df, entry) + + saved_df = DF.from_csv!(entry) + assert DF.to_columns(saved_df) == DF.to_columns(Explorer.Datasets.wine()) + end end describe "from_csv/2 - HTTP" do