Build: use rclone for sync (#9842)
- Put this new feature under a feature flag.
- Works out of the box with our current settings, no rclone configuration file required.
- Uses the local filesystem when running tests, and MinIO during development.
- We need to install rclone in our builders for this to work.
- I'm using the checks implemented in #9890, which needs to be merged first.
- If we want even faster upload times for Sphinx, we can merge readthedocs/readthedocs-sphinx-ext#119,
  since right now we are re-uploading all files.

To test this, you need to rebuild your Docker containers.

Closes #9448
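
For reference, a hypothetical Django shell sketch for opting a project into the new flag (the slug is a placeholder; Feature/Project fields as defined in readthedocs/projects/models.py):

# Hypothetical sketch: enable the USE_RCLONE feature flag for one project.
from readthedocs.projects.models import Feature, Project

feature, _ = Feature.objects.get_or_create(feature_id=Feature.USE_RCLONE)
feature.projects.add(Project.objects.get(slug="my-project"))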
stsewd authored Jan 25, 2023
1 parent 634e02f commit c48e5eb
Showing 8 changed files with 302 additions and 5 deletions.
2 changes: 2 additions & 0 deletions .circleci/config.yml
@@ -18,6 +18,8 @@ jobs:
      - checkout
      - run: git submodule sync
      - run: git submodule update --init
      - run: sudo apt update
      - run: sudo apt install -y rclone
      - run: pip install --user 'tox<5'
      - run: tox -e py310
      - codecov/upload
3 changes: 2 additions & 1 deletion dockerfiles/Dockerfile
@@ -30,7 +30,8 @@ RUN apt-get -y install \
    netcat \
    telnet \
    lsb-release \
    npm
    npm \
    rclone

# Gets the MinIO mc client used to add buckets upon initialization
# If this client should have issues running inside this image, it is also
18 changes: 18 additions & 0 deletions readthedocs/builds/storage.py
@@ -1,3 +1,4 @@
from functools import cached_property
from pathlib import Path

import structlog
@@ -7,6 +8,7 @@
from storages.utils import get_available_overwrite_name, safe_join

from readthedocs.core.utils.filesystem import safe_open
from readthedocs.storage.rclone import RCloneLocal

log = structlog.get_logger(__name__)

@@ -172,6 +174,18 @@ def sync_directory(self, source, destination):
                log.debug('Deleting file from media storage.', filepath=filepath)
                self.delete(filepath)

    @cached_property
    def _rclone(self):
        raise NotImplementedError

    def rclone_sync_directory(self, source, destination):
        """Sync a directory recursively to storage using rclone sync."""
        if destination in ("", "/"):
            raise SuspiciousFileOperation("Syncing all storage cannot be right")

        self._check_suspicious_path(source)
        return self._rclone.sync(source, destination)

    def join(self, directory, filepath):
        return safe_join(directory, filepath)

@@ -206,6 +220,10 @@ def __init__(self, **kwargs):

        super().__init__(location)

    @cached_property
    def _rclone(self):
        return RCloneLocal(location=self.location)

    def get_available_name(self, name, max_length=None):
        """
        A hack to overwrite by default with the FileSystemStorage.
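
For context, a minimal usage sketch of the new entry point, assuming the filesystem-backed BuildMediaFileSystemStorage defined in this module (paths are illustrative):

# Hypothetical sketch (not part of the diff): sync a local build output
# directory into media storage through the new rclone code path.
from readthedocs.builds.storage import BuildMediaFileSystemStorage

storage = BuildMediaFileSystemStorage()
# Behaves like sync_directory(), but shells out to `rclone sync`.
storage.rclone_sync_directory("/tmp/html", "html/my-project/latest")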
5 changes: 5 additions & 0 deletions readthedocs/projects/models.py
@@ -1849,6 +1849,7 @@ def add_features(sender, **kwargs):
    USE_SPHINX_BUILDERS = "use_sphinx_builders"
    CANCEL_OLD_BUILDS = "cancel_old_builds"
    DONT_CREATE_INDEX = "dont_create_index"
    USE_RCLONE = "use_rclone"

    FEATURES = (
        (ALLOW_DEPRECATED_WEBHOOKS, _('Allow deprecated webhook views')),
@@ -2005,6 +2006,10 @@ def add_features(sender, **kwargs):
            DONT_CREATE_INDEX,
            _('Do not create index.md or README.rst if the project does not have one.'),
        ),
        (
            USE_RCLONE,
            _("Use rclone for syncing files to the media storage."),
        ),
    )

    projects = models.ManyToManyField(
5 changes: 4 additions & 1 deletion readthedocs/projects/tasks/builds.py
@@ -839,7 +839,10 @@ def store_build_artifacts(self):
                version_type=self.data.version.type,
            )
            try:
                build_media_storage.sync_directory(from_path, to_path)
                if self.data.project.has_feature(Feature.USE_RCLONE):
                    build_media_storage.rclone_sync_directory(from_path, to_path)
                else:
                    build_media_storage.sync_directory(from_path, to_path)
            except Exception:
                # Ideally this should just be an IOError
                # but some storage backends unfortunately throw other errors
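
A hypothetical test-side sketch of exercising this branch, assuming the django_dynamic_fixture helper used elsewhere in the test suite:

# Hypothetical sketch: give a project the flag so store_build_artifacts()
# takes the rclone_sync_directory() branch instead of sync_directory().
from django_dynamic_fixture import get

from readthedocs.projects.models import Feature, Project

project = get(Project, slug="my-project")
get(Feature, feature_id=Feature.USE_RCLONE, projects=[project])
assert project.has_feature(Feature.USE_RCLONE)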
52 changes: 52 additions & 0 deletions readthedocs/rtd_tests/tests/test_build_storage.py
@@ -157,3 +157,55 @@ def test_walk(self):
        self.assertEqual(top, 'files/api')
        self.assertCountEqual(dirs, [])
        self.assertCountEqual(files, ['index.html'])

    def test_rclone_sync(self):
        tmp_files_dir = Path(tempfile.mkdtemp()) / "files"
        shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
        storage_dir = "files"

        tree = [
            ("api", ["index.html"]),
            "api.fjson",
            "conf.py",
            "test.html",
        ]
        with override_settings(DOCROOT=tmp_files_dir):
            self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
        self.assertFileTree(storage_dir, tree)

        tree = [
            ("api", ["index.html"]),
            "conf.py",
            "test.html",
        ]
        (tmp_files_dir / "api.fjson").unlink()
        with override_settings(DOCROOT=tmp_files_dir):
            self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
        self.assertFileTree(storage_dir, tree)

        tree = [
            "conf.py",
            "test.html",
        ]
        shutil.rmtree(tmp_files_dir / "api")
        with override_settings(DOCROOT=tmp_files_dir):
            self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
        self.assertFileTree(storage_dir, tree)

    def test_rclone_sync_source_symlink(self):
        tmp_dir = Path(tempfile.mkdtemp())
        tmp_symlink_dir = Path(tempfile.mkdtemp()) / "files"
        tmp_symlink_dir.symlink_to(tmp_dir)

        with override_settings(DOCROOT=tmp_dir):
            with pytest.raises(SuspiciousFileOperation, match="symbolic link"):
                self.storage.rclone_sync_directory(tmp_symlink_dir, "files")

    def test_rclone_sync_source_outside_docroot(self):
        tmp_dir = Path(tempfile.mkdtemp())
        tmp_docroot = Path(tempfile.mkdtemp()) / "docroot"
        tmp_docroot.mkdir()

        with override_settings(DOCROOT=tmp_docroot):
            with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
                self.storage.rclone_sync_directory(tmp_dir, "files")
192 changes: 192 additions & 0 deletions readthedocs/storage/rclone.py
@@ -0,0 +1,192 @@
"""
Wrapper around the rclone command.
See https://rclone.org/docs.
"""

import os
import subprocess

import structlog
from django.utils._os import safe_join as safe_join_fs
from storages.utils import safe_join

log = structlog.get_logger(__name__)


class BaseRClone:

"""
RClone base class.
This class allows you to interact with an rclone remote without
a configuration file, the remote declaration and its options
are passed in the command itself.
This base class allows you to use the local file system as remote.
:param remote_type: You can see the full list of supported providers at
https://rclone.org/#providers.
:param rclone_bin: Binary name or path to the rclone binary.
Defaults to ``rclone``.
:param default_options: Options passed to the rclone command.
:parm env_vars: Environment variables used when executing the rclone command.
Useful to pass secrets to the ``rclone` command, since all arguments and
options will be logged.
"""

remote_type = None
rclone_bin = "rclone"
default_options = [
# Number of file transfers to run in parallel.
# Default value is 4.
"--transfers=8",
# Skip based on checksum (if available) & size, not mod-time & size.
"--checksum",
"--verbose",
]
env_vars = {}

    def _get_target_path(self, path):
        """
        Get the final target path for the remote.

        .. note::

           This doesn't include the remote type,
           this is just the destination path.
        """
        raise NotImplementedError

    def get_target(self, path):
        """
        Get the proper target using the current remote type.

        We start the remote with `:` to create it on the fly,
        instead of having to create a configuration file.

        See https://rclone.org/docs/#backend-path-to-dir.

        :param path: Path to the remote target.
        """
        path = self._get_target_path(path)
        return f":{self.remote_type}:{path}"

    def execute(self, subcommand, args, options=None):
        """
        Execute an rclone subcommand.

        :param subcommand: Name of the subcommand.
        :param list args: List of positional arguments passed to the command.
        :param list options: List of options passed to the command.
        """
        options = options or []
        command = [
            self.rclone_bin,
            subcommand,
            *self.default_options,
            *options,
            "--",
            *args,
        ]
        env = os.environ.copy()
        env.update(self.env_vars)
        log.info("Executing rclone command.", command=command)
        log.debug("Executing rclone command.", env=env)
        result = subprocess.run(
            command,
            capture_output=True,
            env=env,
            check=True,
        )
        log.debug(
            "rclone execution finished.",
            stdout=result.stdout.decode(),
            stderr=result.stderr.decode(),
            exit_code=result.returncode,
        )
        return result

    def sync(self, source, destination):
        """
        Run the `rclone sync` command.

        See https://rclone.org/commands/rclone_sync/.

        :param source: Local path to the source directory.
        :param destination: Remote path to the destination directory.
        """
        return self.execute("sync", args=[source, self.get_target(destination)])


class RCloneLocal(BaseRClone):

    """
    RClone remote implementation for the local file system.

    Used for local testing only.

    See https://rclone.org/local/.

    :param location: Root directory where the files will be stored.
    """

    remote_type = "local"

    def __init__(self, location):
        self.location = location

    def _get_target_path(self, path):
        return safe_join_fs(self.location, path)


class RCloneS3Remote(BaseRClone):

    """
    RClone remote implementation for S3.

    All secrets will be passed as environment variables to the rclone command.

    See https://rclone.org/s3/.

    :param bucket_name: Name of the S3 bucket.
    :param access_key_id: AWS access key id.
    :param secret_access_key: AWS secret access key.
    :param region: AWS region.
    :param provider: S3 provider, defaults to ``AWS``.
     Useful to use MinIO during development.
     See https://rclone.org/s3/#s3-provider.
    :param acl: Canned ACL used when creating buckets and storing or copying objects.
     See https://rclone.org/s3/#s3-acl.
    :param endpoint: Custom S3 endpoint, useful for development.
    """

    remote_type = "s3"

    def __init__(
        self,
        bucket_name,
        access_key_id,
        secret_access_key,
        region,
        provider="AWS",
        acl=None,
        endpoint=None,
    ):
        # rclone S3 options passed as env vars.
        # https://rclone.org/s3/#standard-options.
        self.env_vars = {
            "RCLONE_S3_PROVIDER": provider,
            "RCLONE_S3_ACCESS_KEY_ID": access_key_id,
            "RCLONE_S3_SECRET_ACCESS_KEY": secret_access_key,
            "RCLONE_S3_REGION": region,
            "RCLONE_S3_LOCATION_CONSTRAINT": region,
        }
        if acl:
            self.env_vars["RCLONE_S3_ACL"] = acl
        if endpoint:
            self.env_vars["RCLONE_S3_ENDPOINT"] = endpoint
        self.bucket_name = bucket_name

    def _get_target_path(self, path):
        """Overridden to prepend the bucket name to the path."""
        return safe_join(self.bucket_name, path)
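
To tie the pieces together, a hypothetical sketch of the S3 remote; the bucket, credentials, and endpoint are placeholders matching a local MinIO setup, not values from this commit:

# Hypothetical sketch: declare an on-the-fly S3 remote and sync to it.
# Roughly equivalent to running:
#   rclone sync --transfers=8 --checksum --verbose -- /tmp/html :s3:media/html/my-project/latest
# with credentials passed through the RCLONE_S3_* environment variables.
from readthedocs.storage.rclone import RCloneS3Remote

remote = RCloneS3Remote(
    bucket_name="media",             # placeholder bucket
    access_key_id="admin",           # placeholder dev credential
    secret_access_key="password",    # placeholder dev credential
    region="us-east-1",
    provider="Minio",                # rclone's S3 provider name for MinIO
    endpoint="http://minio:9000",    # placeholder local MinIO endpoint
)
remote.sync("/tmp/html", "html/my-project/latest")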
