From 1ccc43b59b3860e95c394bcd7d1a0fdb42df4b1c Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Thu, 22 Dec 2022 19:00:37 -0500
Subject: [PATCH] Build: use rclone for sync

---
 dockerfiles/Dockerfile               |  3 +-
 readthedocs/builds/storage.py        | 10 +++
 readthedocs/projects/models.py       |  5 ++
 readthedocs/projects/tasks/builds.py |  5 +-
 readthedocs/storage/rclone.py        | 91 ++++++++++++++++++++++++++++
 readthedocs/storage/s3_storage.py    | 29 ++++-
 6 files changed, 138 insertions(+), 5 deletions(-)
 create mode 100644 readthedocs/storage/rclone.py

diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile
index b26f322a48e..e6004b1976b 100644
--- a/dockerfiles/Dockerfile
+++ b/dockerfiles/Dockerfile
@@ -30,7 +30,8 @@ RUN apt-get -y install \
     netcat \
     telnet \
     lsb-release \
-    npm
+    npm \
+    rclone
 
 # Gets the MinIO mc client used to add buckets upon initialization
 # If this client should have issues running inside this image, it is also
diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py
index b4ddc5305c2..4a4c39372ad 100644
--- a/readthedocs/builds/storage.py
+++ b/readthedocs/builds/storage.py
@@ -1,4 +1,5 @@
+from functools import cached_property
 from pathlib import Path
 
 import structlog
 from django.conf import settings
@@ -7,6 +8,7 @@
 from storages.utils import get_available_overwrite_name, safe_join
 
 from readthedocs.core.utils.filesystem import safe_open
+from readthedocs.storage.rclone import RClone
 
 log = structlog.get_logger(__name__)
 
@@ -153,6 +155,14 @@ def sync_directory(self, source, destination):
                 log.debug('Deleting file from media storage.', filepath=filepath)
                 self.delete(filepath)
 
+    @cached_property
+    def _rclone(self):
+        return RClone()
+
+    def rclone_sync(self, source, destination):
+        """Sync a directory recursively to storage using rclone sync."""
+        return self._rclone.sync(source, destination)
+
     def join(self, directory, filepath):
         return safe_join(directory, filepath)
 
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
index 2b14553a2e7..7a64fb5ffbb 100644
--- a/readthedocs/projects/models.py
+++ b/readthedocs/projects/models.py
@@ -1842,6 +1842,7 @@ def add_features(sender, **kwargs):
     USE_SPHINX_BUILDERS = "use_sphinx_builders"
     CANCEL_OLD_BUILDS = "cancel_old_builds"
     DONT_CREATE_INDEX = "dont_create_index"
+    USE_RCLONE = "use_rclone"
 
     FEATURES = (
         (ALLOW_DEPRECATED_WEBHOOKS, _('Allow deprecated webhook views')),
@@ -1998,6 +1999,10 @@ def add_features(sender, **kwargs):
         (
             DONT_CREATE_INDEX,
             _('Do not create index.md or README.rst if the project does not have one.'),
         ),
+        (
+            USE_RCLONE,
+            _("Use rclone for syncing files to the media storage."),
+        ),
     )
 
     projects = models.ManyToManyField(
diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py
index f5a72d2a0c3..af1a3977a48 100644
--- a/readthedocs/projects/tasks/builds.py
+++ b/readthedocs/projects/tasks/builds.py
@@ -833,7 +833,10 @@ def store_build_artifacts(
                 version_type=self.data.version.type,
             )
             try:
-                build_media_storage.sync_directory(from_path, to_path)
+                if self.data.project.has_feature(Feature.USE_RCLONE):
+                    build_media_storage.rclone_sync(from_path, to_path)
+                else:
+                    build_media_storage.sync_directory(from_path, to_path)
             except Exception:
                 # Ideally this should just be an IOError
                 # but some storage backends unfortunately throw other errors
diff --git a/readthedocs/storage/rclone.py b/readthedocs/storage/rclone.py
new file mode 100644
index 00000000000..99ea1f9707e
--- /dev/null
+++ b/readthedocs/storage/rclone.py
@@ -0,0 +1,91 @@
+"""Wrapper around the rclone command."""
+
+import os
+import subprocess
+
+import structlog
+
+log = structlog.get_logger(__name__)
+
+
+class RClone:
+
+    remote_type = "local"
+    rclone_bin = "rclone"
+    default_options = [
+        # Number of file transfers to run in parallel.
+        "--transfers=8",
+        "--verbose",
+    ]
+    env_vars = {}
+
+    def build_target(self, path):
+        return f":{self.remote_type}:{path}"
+
+    def execute(self, action, args, options=None):
+        options = options or []
+        command = [
+            self.rclone_bin,
+            action,
+            *self.default_options,
+            *options,
+            "--",
+            *args,
+        ]
+        # Extend the current environment, so rclone still has PATH, HOME, etc.
+        env = os.environ.copy()
+        env.update(self.env_vars)
+        log.info("Executing rclone command.", command=command)
+        log.debug("rclone env.", env_var_names=sorted(self.env_vars))
+        result = subprocess.run(
+            command,
+            capture_output=True,
+            env=env,
+        )
+        log.debug(
+            "Result.",
+            stdout=result.stdout.decode(),
+            stderr=result.stderr.decode(),
+            exit_code=result.returncode,
+        )
+        return result
+
+    def sync(self, source, destination):
+        # TODO: check if source can be a symlink.
+        return self.execute("sync", args=[source, self.build_target(destination)])
+
+
+class RCloneS3Remote(RClone):
+
+    remote_type = "s3"
+
+    def __init__(
+        self,
+        bucket_name,
+        access_key_id,
+        secret_access_key,
+        region,
+        provider="AWS",
+        acl=None,
+        endpoint=None,
+    ):
+        super().__init__()
+        # rclone S3 options are passed as env vars.
+        # https://rclone.org/s3/#standard-options
+        region = region or ""
+        self.env_vars = {
+            "RCLONE_S3_PROVIDER": provider,
+            "RCLONE_S3_ACCESS_KEY_ID": access_key_id,
+            "RCLONE_S3_SECRET_ACCESS_KEY": secret_access_key,
+            "RCLONE_S3_REGION": region,
+            "RCLONE_S3_LOCATION_CONSTRAINT": region,
+        }
+        if acl:
+            self.env_vars["RCLONE_S3_ACL"] = acl
+        if endpoint:
+            self.env_vars["RCLONE_S3_ENDPOINT"] = endpoint
+        self.bucket_name = bucket_name
+
+    def build_target(self, path):
+        path = f"{self.bucket_name}/{path}"
+        return super().build_target(path)
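
Note (illustrative only, not part of the patch): a minimal sketch of the
command lines the wrapper above ends up building. The paths and bucket are
made up; only RClone, RCloneS3Remote, and their options come from this patch.

    from readthedocs.storage.rclone import RClone, RCloneS3Remote

    # Local remote: the target is ":local:<path>".
    RClone().sync("/tmp/html", "/srv/media/html")
    # Runs roughly:
    #   rclone sync --transfers=8 --verbose -- /tmp/html :local:/srv/media/html

    # S3 remote: credentials travel via RCLONE_S3_* environment variables
    # and the target becomes ":s3:<bucket>/<path>".
    remote = RCloneS3Remote(
        bucket_name="example-media",    # hypothetical bucket
        access_key_id="AKIAXXXXXXXX",   # hypothetical credentials
        secret_access_key="xxxx",
        region="us-east-1",
    )
    remote.sync("/tmp/html", "html/example-project/latest")
    # Runs roughly:
    #   rclone sync --transfers=8 --verbose -- /tmp/html :s3:example-media/html/example-project/latest
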
diff --git a/readthedocs/storage/s3_storage.py b/readthedocs/storage/s3_storage.py
index 197602ea50c..1cfba440de2 100644
--- a/readthedocs/storage/s3_storage.py
+++ b/readthedocs/storage/s3_storage.py
@@ -9,16 +9,39 @@
 # Disable abstract method because we are not overriding all the methods
 # pylint: disable=abstract-method
+from functools import cached_property
 from django.conf import settings
 from django.core.exceptions import ImproperlyConfigured
 from storages.backends.s3boto3 import S3Boto3Storage, S3ManifestStaticStorage
 
 from readthedocs.builds.storage import BuildMediaStorageMixin
+from readthedocs.storage.rclone import RCloneS3Remote
 
 from .mixins import OverrideHostnameMixin, S3PrivateBucketMixin
 
 
-class S3BuildMediaStorage(BuildMediaStorageMixin, OverrideHostnameMixin, S3Boto3Storage):
+class S3BuildMediaStorageMixin(BuildMediaStorageMixin, S3Boto3Storage):
+
+    @cached_property
+    def _rclone(self):
+        provider = "AWS"
+        # If a custom endpoint URL is given and we are
+        # running in DEBUG mode, use MinIO as the provider.
+        if self.endpoint_url and settings.DEBUG:
+            provider = "minio"
+
+        return RCloneS3Remote(
+            bucket_name=self.bucket_name,
+            access_key_id=self.access_key,
+            secret_access_key=self.secret_key,
+            region=self.region_name,
+            acl=self.default_acl,
+            endpoint=self.endpoint_url,
+            provider=provider,
+        )
+
+
+class S3BuildMediaStorage(OverrideHostnameMixin, S3BuildMediaStorageMixin):
 
     """An AWS S3 Storage backend for build artifacts."""
 
 
@@ -94,7 +117,7 @@ class NoManifestS3StaticStorage(
     """
 
 
-class S3BuildEnvironmentStorage(S3PrivateBucketMixin, BuildMediaStorageMixin, S3Boto3Storage):
+class S3BuildEnvironmentStorage(S3PrivateBucketMixin, S3BuildMediaStorageMixin):
 
     bucket_name = getattr(settings, 'S3_BUILD_ENVIRONMENT_STORAGE_BUCKET', None)
 
@@ -108,7 +131,7 @@ def __init__(self, *args, **kwargs):
         )
 
 
-class S3BuildToolsStorage(S3PrivateBucketMixin, BuildMediaStorageMixin, S3Boto3Storage):
+class S3BuildToolsStorage(S3PrivateBucketMixin, S3BuildMediaStorageMixin):
 
     bucket_name = getattr(settings, 'S3_BUILD_TOOLS_STORAGE_BUCKET', None)
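
Note (illustrative only, not part of the patch): how the pieces fit together
once the USE_RCLONE feature flag is enabled for a project. The paths are made
up; the classes and methods are the ones introduced above.

    from readthedocs.storage.s3_storage import S3BuildMediaStorage

    storage = S3BuildMediaStorage()
    # storage._rclone is built lazily by the cached_property: it is an
    # RCloneS3Remote with provider="minio" when a custom endpoint_url is
    # set and settings.DEBUG is on, and provider="AWS" otherwise.
    # store_build_artifacts() calls this instead of sync_directory():
    storage.rclone_sync("/tmp/artifacts/html", "html/example-project/latest")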