Skip to content

Commit

Permalink
Move httpfs extension outside of duckdb and rename to cached_httpfs
Browse files Browse the repository at this point in the history
* Rename/modify all occurrences of httpfs
* Skip clang format for folder
  • Loading branch information
mkaruza committed Aug 7, 2024
1 parent a957839 commit dbde53e
Show file tree
Hide file tree
Showing 22 changed files with 3,946 additions and 5 deletions.
7 changes: 2 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,13 @@ third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB):
DISABLE_SANITIZER=1 \
ENABLE_UBSAN=0 \
BUILD_UNITTESTS=OFF \
BUILD_HTTPFS=1 \
BUILD_JSON=1 \
CMAKE_EXPORT_COMPILE_COMMANDS=1 \
-j8
EXTENSION_CONFIGS="../pg_duckdb_extensions.cmake"

install-duckdb:
$(install_bin) -m 755 third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB) $(DESTDIR)$(PG_LIB)

clean-duckdb:
#rm -rf third_party/duckdb/build
rm -rf third_party/duckdb/build

install: install-duckdb

Expand Down
2 changes: 2 additions & 0 deletions third_party/cached_httpfs/.clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Skip clang-format for this folder: the sources here are neither reformatted
# nor have their #include lines reordered.
DisableFormat: true
SortIncludes: false
51 changes: 51 additions & 0 deletions third_party/cached_httpfs/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Build script for the cached_httpfs extension. It produces both a statically
# linked and a loadable flavour of the extension from the same set of sources.
cmake_minimum_required(VERSION 2.8.12...3.29)

project(CachedHTTPFsExtension)

add_extension_definitions()

include_directories(include ../duckdb/third_party/httplib
                    ../duckdb/parquet/include)

# Sources shared by the static and the loadable extension targets.
set(CACHED_HTTPFS_SOURCES
    hffs.cpp
    s3fs.cpp
    httpfs.cpp
    http_state.cpp
    crypto.cpp
    create_secret_functions.cpp
    cached_httpfs_extension.cpp)

build_static_extension(cached_httpfs ${CACHED_HTTPFS_SOURCES})

set(PARAMETERS "-warnings")
build_loadable_extension(cached_httpfs ${PARAMETERS} ${CACHED_HTTPFS_SOURCES})

# On MinGW, ask FindOpenSSL for static libraries; this must be set before
# find_package(OpenSSL) runs.
if(MINGW)
  set(OPENSSL_USE_STATIC_LIBS TRUE)
endif()

find_package(OpenSSL REQUIRED)
include_directories(${OPENSSL_INCLUDE_DIR})

# Both flavours link against the bundled mbedtls and the system OpenSSL.
foreach(EXT_TARGET cached_httpfs_loadable_extension cached_httpfs_extension)
  target_link_libraries(${EXT_TARGET} duckdb_mbedtls ${OPENSSL_LIBRARIES})
endforeach()

# Extra link dependencies that are only added for MinGW builds.
if(MINGW)
  find_package(ZLIB)
  foreach(EXT_TARGET cached_httpfs_loadable_extension cached_httpfs_extension)
    target_link_libraries(${EXT_TARGET} ZLIB::ZLIB -lcrypt32)
  endforeach()
endif()

install(
  TARGETS cached_httpfs_extension
  EXPORT "${DUCKDB_EXPORT_SET}"
  LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
  ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
1 change: 1 addition & 0 deletions third_party/cached_httpfs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Documentation on setting up the S3 tests can be found in the [S3 test setup README](../../test/sql/copy/s3/README.md).
104 changes: 104 additions & 0 deletions third_party/cached_httpfs/cached_httpfs_extension.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#define DUCKDB_EXTENSION_MAIN

#include "cached_httpfs_extension.hpp"

#include "create_secret_functions.hpp"
#include "duckdb.hpp"
#include "s3fs.hpp"
#include "hffs.hpp"
#include "crypto.hpp"

namespace duckdb {

// Wires the extension into a database instance: registers the HTTP, HuggingFace and S3 file systems,
// declares every extension option with its default, applies credentials via
// AWSEnvironmentCredentialsProvider, registers the secret-creation functions and installs the
// OpenSSL-backed encryption factory on the database configuration.
static void LoadInternal(DatabaseInstance &instance) {
	// Self-test first: run some checks to confirm all the hashes work out
	S3FileSystem::Verify();

	// File systems contributed by this extension
	auto &file_system = instance.GetFileSystem();
	file_system.RegisterSubSystem(make_uniq<HTTPFileSystem>());
	file_system.RegisterSubSystem(make_uniq<HuggingFaceFileSystem>());
	file_system.RegisterSubSystem(make_uniq<S3FileSystem>(BufferManager::GetBufferManager(instance)));

	auto &db_config = DBConfig::GetConfig(instance);

	// ---- Global HTTP options ----
	// One timeout value covers all four timeout kinds (read/write/connection/retry); it could be split into
	// four separate options should users need that
	db_config.AddExtensionOption("http_timeout", "HTTP timeout read/write/connection/retry", LogicalType::UBIGINT,
	                             Value(30000));
	db_config.AddExtensionOption("http_retries", "HTTP retries on I/O error", LogicalType::UBIGINT, Value(3));
	db_config.AddExtensionOption("http_retry_wait_ms", "Time between retries", LogicalType::UBIGINT, Value(100));
	db_config.AddExtensionOption("force_download", "Forces upfront download of file", LogicalType::BOOLEAN,
	                             Value(false));
	// The backoff factor cuts down the number of requests issued while waiting: e.g. a retry_wait_ms of 50 with
	// a backoff factor of 2 yields wait times of 0 50 100 200 400...etc.
	db_config.AddExtensionOption("http_retry_backoff", "Backoff factor for exponentially increasing retry wait time",
	                             LogicalType::FLOAT, Value(4));
	db_config.AddExtensionOption(
	    "http_keep_alive",
	    "Keep alive connections. Setting this to false can help when running into connection failures",
	    LogicalType::BOOLEAN, Value(true));
	db_config.AddExtensionOption("enable_server_cert_verification", "Enable server side certificate verification.",
	                             LogicalType::BOOLEAN, Value(false));
	db_config.AddExtensionOption("ca_cert_file", "Path to a custom certificate file for self-signed certificates.",
	                             LogicalType::VARCHAR, Value(""));

	// ---- Global S3 options ----
	db_config.AddExtensionOption("s3_region", "S3 Region", LogicalType::VARCHAR, Value("us-east-1"));
	db_config.AddExtensionOption("s3_access_key_id", "S3 Access Key ID", LogicalType::VARCHAR);
	db_config.AddExtensionOption("s3_secret_access_key", "S3 Access Key", LogicalType::VARCHAR);
	db_config.AddExtensionOption("s3_session_token", "S3 Session Token", LogicalType::VARCHAR);
	db_config.AddExtensionOption("s3_endpoint", "S3 Endpoint", LogicalType::VARCHAR);
	db_config.AddExtensionOption("s3_url_style", "S3 URL style", LogicalType::VARCHAR, Value("vhost"));
	db_config.AddExtensionOption("s3_use_ssl", "S3 use SSL", LogicalType::BOOLEAN, Value(true));
	db_config.AddExtensionOption("s3_url_compatibility_mode", "Disable Globs and Query Parameters on S3 URLs",
	                             LogicalType::BOOLEAN, Value(false));

	// ---- S3 uploader options ----
	db_config.AddExtensionOption("s3_uploader_max_filesize", "S3 Uploader max filesize (between 50GB and 5TB)",
	                             LogicalType::VARCHAR, "800GB");
	db_config.AddExtensionOption("s3_uploader_max_parts_per_file",
	                             "S3 Uploader max parts per file (between 1 and 10000)", LogicalType::UBIGINT,
	                             Value(10000));
	db_config.AddExtensionOption("s3_uploader_thread_limit", "S3 Uploader global thread limit", LogicalType::UBIGINT,
	                             Value(50));

	// ---- HuggingFace options ----
	db_config.AddExtensionOption("hf_max_per_page", "Debug option to limit number of items returned in list requests",
	                             LogicalType::UBIGINT, Value::UBIGINT(0));

	// Apply the environment credentials provider to the configuration; this runs after the options above
	// have been declared
	auto env_credentials = make_uniq<AWSEnvironmentCredentialsProvider>(db_config);
	env_credentials->SetAll();

	// Functions for creating S3 and bearer-token secrets
	CreateS3SecretFunctions::Register(instance);
	CreateBearerTokenFunctions::Register(instance);

	// Point the configuration at the OpenSSL encryption state factory
	db_config.encryption_util = make_shared_ptr<AESGCMStateSSLFactory>();
}

// Loads the extension into the database instance owned by `db`; all of the actual registration work is
// delegated to LoadInternal.
void CachedHttpfsExtension::Load(DuckDB &db) {
	DatabaseInstance &instance = *db.instance;
	LoadInternal(instance);
}
// Returns the name of this extension, "cached_httpfs".
std::string CachedHttpfsExtension::Name() {
	std::string extension_name("cached_httpfs");
	return extension_name;
}

// Returns the extension version string supplied at build time through a preprocessor definition, or an
// empty string when no such definition exists.
//
// The extension was renamed from "httpfs" to "cached_httpfs", so the macro matching the new name
// (EXT_VERSION_CACHED_HTTPFS) is consulted first. The legacy EXT_VERSION_HTTPFS is still honoured so build
// setups that define the old name keep working.
// NOTE(review): assumes the DuckDB extension build derives the macro name from the upper-cased extension
// name (EXT_VERSION_<NAME>) - confirm against the DuckDB CMake helpers in use.
std::string CachedHttpfsExtension::Version() const {
#if defined(EXT_VERSION_CACHED_HTTPFS)
	return EXT_VERSION_CACHED_HTTPFS;
#elif defined(EXT_VERSION_HTTPFS)
	return EXT_VERSION_HTTPFS;
#else
	return "";
#endif
}

} // namespace duckdb

// C-linkage entry points exported by the extension.
// NOTE(review): presumably these are resolved by name by DuckDB's extension loader when the loadable
// build is used - confirm against the loader's naming convention.
extern "C" {

// Runs the same registration routine as CachedHttpfsExtension::Load on the given database instance.
DUCKDB_EXTENSION_API void cached_httpfs_init(duckdb::DatabaseInstance &db) {
	duckdb::LoadInternal(db);
}

// Reports the version of the DuckDB library this extension was built against (not the extension's own
// version string).
DUCKDB_EXTENSION_API const char *cached_httpfs_version() {
	const char *library_version = duckdb::DuckDB::LibraryVersion();
	return library_version;
}
}

#ifndef DUCKDB_EXTENSION_MAIN
#error DUCKDB_EXTENSION_MAIN not defined
#endif
Loading

0 comments on commit dbde53e

Please sign in to comment.