Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

branch-3.0: [feat](clone) Speed clone tablet via batch small file downloading #45061 #45191

Merged
merged 1 commit into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions be/src/common/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ DEFINE_mInt32(max_download_speed_kbps, "50000");
DEFINE_mInt32(download_low_speed_limit_kbps, "50");
// Download low-speed time, in seconds (default "300").
DEFINE_mInt32(download_low_speed_time, "300");
// Whether to download small files in batch. Defaults to "false", i.e. disabled.
DEFINE_mBool(enable_batch_download, "false");

DEFINE_String(sys_log_dir, "");
DEFINE_String(user_function_dir, "${DORIS_HOME}/lib/udf");
Expand Down
2 changes: 2 additions & 0 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,8 @@ DECLARE_mInt32(max_download_speed_kbps);
DECLARE_mInt32(download_low_speed_limit_kbps);
// Download low-speed time, in seconds.
DECLARE_mInt32(download_low_speed_time);
// Whether to download small files in batch.
DECLARE_mBool(enable_batch_download);

// Deprecated: use the env var LOG_DIR in be.conf instead.
DECLARE_String(sys_log_dir);
Expand Down
6 changes: 6 additions & 0 deletions be/src/gutil/strings/stringpiece.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,12 @@ class StringPiece {
assert(length <= static_cast<size_t>(std::numeric_limits<int>::max()));
length_ = static_cast<int>(length);
}
// Implicit conversion from std::string_view: the resulting StringPiece points at
// the view's character data (nothing is copied). The view's size has to fit in
// an int; this is only enforced through the assert below.
StringPiece(std::string_view view) // NOLINT(runtime/explicit)
        : ptr_(view.data()), length_(static_cast<int>(view.size())) {
    assert(view.size() <= static_cast<size_t>(std::numeric_limits<int>::max()));
}
// Refers to `len` bytes starting at `offset`; `len` must be non-negative (asserted).
StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { assert(len >= 0); }

// Substring of another StringPiece.
Expand Down
216 changes: 216 additions & 0 deletions be/src/http/action/batch_download_action.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "http/action/batch_download_action.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "common/config.h"
#include "common/logging.h"
#include "common/status.h"
#include "gutil/strings/split.h"
#include "http/http_channel.h"
#include "http/http_method.h"
#include "http/http_request.h"
#include "http/utils.h"
#include "io/fs/local_file_system.h"
#include "runtime/exec_env.h"
#include "util/security.h"

namespace doris {
namespace {
// Names of the URL query parameters read by BatchDownloadAction.

// "check": when its value is "true" the handler just replies "OK" (API support probe).
const std::string CHECK_PARAMETER = "check";
// "list": when its value is "true" the handler lists the files of the requested directory
// instead of sending file contents.
const std::string LIST_PARAMETER = "list";
// "dir": the directory the request operates on.
const std::string DIR_PARAMETER = "dir";
// "token": compared against the local token when config::enable_token_check is set.
const std::string TOKEN_PARAMETER = "token";
} // namespace

// Builds the handler.
//
// exec_env:         forwarded to HttpHandlerWithAuth.
// rate_limit_group: stored in _rate_limit_group.
// allow_dirs:       directories this handler may serve. Each entry is canonicalized
//                   once here and stored in _allow_paths; entries that can not be
//                   canonicalized are skipped (best effort), but a warning is logged
//                   so that a later "path is not allowed" rejection can be traced
//                   back to the dropped directory.
BatchDownloadAction::BatchDownloadAction(
        ExecEnv* exec_env, std::shared_ptr<bufferevent_rate_limit_group> rate_limit_group,
        const std::vector<std::string>& allow_dirs)
        : HttpHandlerWithAuth(exec_env), _rate_limit_group(std::move(rate_limit_group)) {
    _allow_paths.reserve(allow_dirs.size());
    for (const auto& dir : allow_dirs) {
        std::string canonical_dir;
        Status st = io::global_local_filesystem()->canonicalize(dir, &canonical_dir);
        if (!st.ok()) {
            // Keep going: one bad entry must not prevent serving the other directories.
            LOG(WARNING) << "skip batch download allow dir which can not be canonicalized: "
                         << dir << ", error: " << st.to_string();
            continue;
        }
        _allow_paths.emplace_back(std::move(canonical_dir));
    }
}

// Entry point for one batch download request.
//
// Processing order:
//   1. check=true  -> reply "OK" and stop (API support probe).
//   2. 'dir' must be present (400 otherwise) and must not contain ".." (403 otherwise).
//   3. If config::enable_token_check is set, the request token is verified.
//   4. 'dir' must be inside one of the allowed paths.
//   5. 'dir' must be an existing directory (400 if it is not a directory).
//   6. The request is handed over to _handle().
void BatchDownloadAction::handle(HttpRequest* req) {
    if (VLOG_CRITICAL_IS_ON) {
        VLOG_CRITICAL << "accept one batch download request " << req->debug_string();
    }

    // API support probe: nothing else is validated for it.
    if (req->param(CHECK_PARAMETER) == "true") {
        HttpChannel::send_reply(req, "OK");
        return;
    }

    // Logs the reason (the url is passed through mask_token) and sends it to the client.
    auto reject = [req](HttpStatus code, const std::string& reason) {
        LOG(WARNING) << "handle batch download request: " << reason
                     << ", url: " << mask_token(req->uri());
        HttpChannel::send_reply(req, code, reason);
    };

    // The 'dir' parameter selects the directory this request works on.
    const std::string& dir_path = req->param(DIR_PARAMETER);
    if (dir_path.empty()) {
        reject(HttpStatus::BAD_REQUEST,
               fmt::format("parameter {} not specified in url.", DIR_PARAMETER));
        return;
    }
    if (dir_path.find("..") != std::string::npos) {
        reject(HttpStatus::FORBIDDEN,
               fmt::format("Not allowed to read relative path: {}", dir_path));
        return;
    }

    if (config::enable_token_check) {
        Status token_status = _check_token(req);
        if (!token_status.ok()) {
            const auto code = token_status.is<ErrorCode::NOT_AUTHORIZED>()
                                      ? HttpStatus::UNAUTHORIZED
                                      : HttpStatus::INTERNAL_SERVER_ERROR;
            HttpChannel::send_reply(req, code, token_status.to_string());
            return;
        }
    }

    Status allowed_status = _check_path_is_allowed(dir_path);
    if (!allowed_status.ok()) {
        // Map the failure onto an HTTP status; anything unexpected is a server error.
        auto code = HttpStatus::INTERNAL_SERVER_ERROR;
        if (allowed_status.is<ErrorCode::NOT_FOUND>() ||
            allowed_status.is<ErrorCode::IO_ERROR>()) {
            code = HttpStatus::NOT_FOUND;
        } else if (allowed_status.is<ErrorCode::NOT_AUTHORIZED>()) {
            code = HttpStatus::UNAUTHORIZED;
        }
        HttpChannel::send_reply(req, code, allowed_status.to_string());
        return;
    }

    bool is_dir = false;
    Status dir_status = io::global_local_filesystem()->is_directory(dir_path, &is_dir);
    if (!dir_status.ok()) {
        reject(HttpStatus::INTERNAL_SERVER_ERROR, dir_status.to_string());
        return;
    }
    if (!is_dir) {
        reject(HttpStatus::BAD_REQUEST, "The requested path is not a directory: " + dir_path);
        return;
    }

    _handle(req, dir_path);

    VLOG_CRITICAL << "deal with batch download request finished! ";
}

// Dispatches a validated request: with list=true the files of `dir_path` are
// listed (file sizes requested), otherwise the request is treated as a batch
// download of the files named in the request body.
void BatchDownloadAction::_handle(HttpRequest* req, const std::string& dir_path) {
    if (req->param(LIST_PARAMETER) != "true") {
        _handle_batch_download(req, dir_path);
        return;
    }

    // Listing request: ask do_dir_response to acquire the file sizes as well.
    const bool with_file_size = true;
    do_dir_response(dir_path, req, with_file_size);
}

// Sends the files named in the request body, all taken from `dir_path`.
//
// The body is split on '\n' (whitespace-only entries are skipped). The request is
// rejected with
//   - 400 when no file is named, or when more than kMaxFilesPerBatch files are named;
//   - 403 when a name contains '/' or is "." / "..", i.e. when it does not name an
//     entry directly inside `dir_path`.
// Otherwise the whole list is handed to HttpChannel::send_files.
void BatchDownloadAction::_handle_batch_download(HttpRequest* req, const std::string& dir_path) {
    // Upper bound (inclusive) on the number of files accepted in one request.
    constexpr size_t kMaxFilesPerBatch = 64;

    // Logs the reason (the url is passed through mask_token) and sends it to the client.
    auto reject = [req](HttpStatus code, const std::string& reason) {
        LOG(WARNING) << "handle batch download request: " << reason
                     << ", url: " << mask_token(req->uri());
        HttpChannel::send_reply(req, code, reason);
    };

    std::vector<std::string> files =
            strings::Split(req->get_request_body(), "\n", strings::SkipWhitespace());
    if (files.empty()) {
        reject(HttpStatus::BAD_REQUEST, "No file specified in request body.");
        return;
    }

    if (files.size() > kMaxFilesPerBatch) {
        // Exactly kMaxFilesPerBatch files are accepted, so the message says "not exceed".
        reject(HttpStatus::BAD_REQUEST,
               fmt::format("The number of files to download in a batch should not exceed {}.",
                           kMaxFilesPerBatch));
        return;
    }

    for (const auto& file : files) {
        // Only plain entry names are accepted: no path separators and no
        // references to the directory itself or to its parent.
        const bool escapes_dir =
                file == "." || file == ".." || file.find('/') != std::string::npos;
        if (escapes_dir) {
            reject(HttpStatus::FORBIDDEN,
                   fmt::format("Not allowed to read relative path: {}, dir: {}", file, dir_path));
            return;
        }
    }

    HttpChannel::send_files(req, dir_path, std::move(files));
}

// Verifies the 'token' query parameter against the token of the exec env.
// Returns OK when both are equal, NotAuthorized when the parameter is missing
// or different. Tokens and urls are only logged after passing through mask_token.
Status BatchDownloadAction::_check_token(HttpRequest* req) {
    const std::string& request_token = req->param(TOKEN_PARAMETER);
    if (request_token.empty()) {
        LOG(WARNING) << "token is not specified in request. url: " << mask_token(req->uri());
        return Status::NotAuthorized("token is not specified.");
    }

    const std::string& expected_token = _exec_env->token();
    if (request_token == expected_token) {
        return Status::OK();
    }

    LOG(WARNING) << "invalid download token: " << mask_token(request_token)
                 << ", local token: " << mask_token(expected_token)
                 << ", url: " << mask_token(req->uri());
    return Status::NotAuthorized("invalid token {}", mask_token(request_token));
}

// Canonicalizes `file_path` and checks that the result lies inside one of the
// allowed paths collected by the constructor. Returns the canonicalization
// error if that step fails, NotAuthorized if no allowed path contains the
// canonical path, OK otherwise.
Status BatchDownloadAction::_check_path_is_allowed(const std::string& file_path) {
    std::string resolved_path;
    RETURN_IF_ERROR(io::global_local_filesystem()->canonicalize(file_path, &resolved_path));

    bool inside_allowed_root = false;
    for (const auto& root : _allow_paths) {
        if (!io::LocalFileSystem::contain_path(root, resolved_path)) {
            continue;
        }
        inside_allowed_root = true;
        break;
    }

    if (!inside_allowed_root) {
        return Status::NotAuthorized("file path is not allowed: {}", resolved_path);
    }
    return Status::OK();
}

} // end namespace doris
65 changes: 65 additions & 0 deletions be/src/http/action/batch_download_action.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <string>
#include <vector>

#include "common/status.h"
#include "http/http_handler.h"
#include "http/http_handler_with_auth.h"
#include "util/threadpool.h"

struct bufferevent_rate_limit_group;

namespace doris {

class ExecEnv;
class HttpRequest;

// HTTP handler that serves batched file-download requests.
//
// The 'dir' query parameter names the directory a request operates on. After
// canonicalization it has to lie inside one of the `allow_dirs` passed to the
// constructor, otherwise the request is rejected.
//
// The kind of request is selected by query parameters (see handle() in the .cpp):
//   - check=true : replies "OK", so that a client can probe whether this API is supported.
//   - list=true  : returns the list of files in the specified directory.
//   - otherwise  : the file names to download are given in the request body as a list of
//                  strings separated by '\n'. To limit resource usage, at most 64 files
//                  can be downloaded in one batch.
//
// NOTE(review): an earlier version of this comment described listing as a HEAD request and
// downloading as a GET request; handle() itself dispatches on the parameters above and does
// not look at the HTTP method - confirm against the place where the routes are registered.
//
// NOTE(review): std::shared_ptr is used below while <memory> is not included directly by
// this header - presumably it is pulled in transitively; consider including it explicitly.
class BatchDownloadAction : public HttpHandlerWithAuth {
public:
// exec_env is forwarded to HttpHandlerWithAuth, rate_limit_group is kept in
// _rate_limit_group, and allow_dirs are canonicalized into _allow_paths.
BatchDownloadAction(ExecEnv* exec_env,
std::shared_ptr<bufferevent_rate_limit_group> rate_limit_group,
const std::vector<std::string>& allow_dirs);

~BatchDownloadAction() override = default;

// Validates the request (parameters, token, allowed path) and replies to it.
void handle(HttpRequest* req) override;

private:
// Compares the 'token' parameter of the request with the local token.
Status _check_token(HttpRequest* req);
// Checks that the canonical form of `path` is inside one of _allow_paths.
Status _check_path_is_allowed(const std::string& path);

// Dispatches an already validated request to listing or batch download.
void _handle(HttpRequest* req, const std::string& dir_path);
// Sends the files named in the request body from `dir_path`.
void _handle_batch_download(HttpRequest* req, const std::string& dir_path);

// Canonicalized directories this handler is allowed to serve.
std::vector<std::string> _allow_paths;
// Rate limit group handed in at construction.
std::shared_ptr<bufferevent_rate_limit_group> _rate_limit_group;
};

} // end namespace doris
3 changes: 0 additions & 3 deletions be/src/http/action/download_binlog_action.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,9 @@
#include <fmt/ranges.h>

#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string_view>
#include <utility>
#include <vector>

#include "common/config.h"
#include "common/logging.h"
Expand All @@ -34,7 +32,6 @@
#include "http/utils.h"
#include "io/fs/local_file_system.h"
#include "olap/storage_engine.h"
#include "olap/tablet.h"
#include "olap/tablet_manager.h"
#include "runtime/exec_env.h"

Expand Down
Loading
Loading