diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9facf43fdfb..921cdd080fe 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -18,6 +18,8 @@ jobs:
       - checkout
       - run: git submodule sync
       - run: git submodule update --init
+      - run: sudo apt update
+      - run: sudo apt install -y rclone
       - run: pip install --user 'tox<5'
       - run: tox -e py310
       - codecov/upload
diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile
index b26f322a48e..e6004b1976b 100644
--- a/dockerfiles/Dockerfile
+++ b/dockerfiles/Dockerfile
@@ -30,7 +30,8 @@ RUN apt-get -y install \
     netcat \
     telnet \
     lsb-release \
-    npm
+    npm \
+    rclone
 
 # Gets the MinIO mc client used to add buckets upon initialization
 # If this client should have issues running inside this image, it is also
diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py
index 80ac243c9f4..b58380e28be 100644
--- a/readthedocs/builds/storage.py
+++ b/readthedocs/builds/storage.py
@@ -1,3 +1,4 @@
+from functools import cached_property
 from pathlib import Path
 
 import structlog
@@ -7,6 +8,7 @@
 from storages.utils import get_available_overwrite_name, safe_join
 
 from readthedocs.core.utils.filesystem import safe_open
+from readthedocs.storage.rclone import RCloneLocal
 
 log = structlog.get_logger(__name__)
 
@@ -172,6 +174,18 @@ def sync_directory(self, source, destination):
                 log.debug('Deleting file from media storage.', filepath=filepath)
                 self.delete(filepath)
 
+    @cached_property
+    def _rclone(self):
+        raise NotImplementedError
+
+    def rclone_sync_directory(self, source, destination):
+        """Sync a directory recursively to storage using rclone sync."""
+        if destination in ("", "/"):
+            raise SuspiciousFileOperation("Syncing all storage cannot be right")
+
+        self._check_suspicious_path(source)
+        return self._rclone.sync(source, destination)
+
     def join(self, directory, filepath):
         return safe_join(directory, filepath)
 
@@ -206,6 +220,10 @@ def __init__(self, **kwargs):
 
         super().__init__(location)
 
+    @cached_property
+    def _rclone(self):
+        return RCloneLocal(location=self.location)
+
     def get_available_name(self, name, max_length=None):
         """
         A hack to overwrite by default with the FileSystemStorage.
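
For orientation, here is a minimal sketch of driving the new wrapper directly. It is an illustration under assumptions, not code from this diff: the media root and both paths are placeholders, and the rclone binary must already be on PATH (the CircleCI and Dockerfile changes above handle that for CI and the dev image).

    # Hedged sketch: using RCloneLocal (added later in this diff) by hand.
    # "/tmp/media" and both paths are assumed placeholder values.
    from readthedocs.storage.rclone import RCloneLocal

    # The local remote roots every destination at `location`, which is what
    # the filesystem storage's _rclone property wires up to self.location.
    rclone = RCloneLocal(location="/tmp/media")

    # Runs: rclone sync --transfers=8 --checksum --verbose -- \
    #           /tmp/build-output/html :local:/tmp/media/html/myproject/latest
    rclone.sync("/tmp/build-output/html", "html/myproject/latest")

Since execute() calls subprocess.run(..., check=True), a failed transfer surfaces as subprocess.CalledProcessError rather than a silently ignored non-zero exit code.
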
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
index 9c581f13f6a..569a9201e57 100644
--- a/readthedocs/projects/models.py
+++ b/readthedocs/projects/models.py
@@ -1849,6 +1849,7 @@ def add_features(sender, **kwargs):
     USE_SPHINX_BUILDERS = "use_sphinx_builders"
     CANCEL_OLD_BUILDS = "cancel_old_builds"
     DONT_CREATE_INDEX = "dont_create_index"
+    USE_RCLONE = "use_rclone"
 
     FEATURES = (
         (ALLOW_DEPRECATED_WEBHOOKS, _('Allow deprecated webhook views')),
@@ -2005,6 +2006,10 @@ def add_features(sender, **kwargs):
             DONT_CREATE_INDEX,
             _('Do not create index.md or README.rst if the project does not have one.'),
         ),
+        (
+            USE_RCLONE,
+            _("Use rclone for syncing files to the media storage."),
+        ),
     )
 
     projects = models.ManyToManyField(
diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py
index ed4707f16ae..a7fd39b5887 100644
--- a/readthedocs/projects/tasks/builds.py
+++ b/readthedocs/projects/tasks/builds.py
@@ -839,7 +839,10 @@ def store_build_artifacts(self):
                 version_type=self.data.version.type,
             )
             try:
-                build_media_storage.sync_directory(from_path, to_path)
+                if self.data.project.has_feature(Feature.USE_RCLONE):
+                    build_media_storage.rclone_sync_directory(from_path, to_path)
+                else:
+                    build_media_storage.sync_directory(from_path, to_path)
             except Exception:
                 # Ideally this should just be an IOError
                 # but some storage backends unfortunately throw other errors
diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py
index aa2d3eabc88..95b4bed50ec 100644
--- a/readthedocs/rtd_tests/tests/test_build_storage.py
+++ b/readthedocs/rtd_tests/tests/test_build_storage.py
@@ -157,3 +157,55 @@ def test_walk(self):
         self.assertEqual(top, 'files/api')
         self.assertCountEqual(dirs, [])
         self.assertCountEqual(files, ['index.html'])
+
+    def test_rclone_sync(self):
+        tmp_files_dir = Path(tempfile.mkdtemp()) / "files"
+        shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
+        storage_dir = "files"
+
+        tree = [
+            ("api", ["index.html"]),
+            "api.fjson",
+            "conf.py",
+            "test.html",
+        ]
+        with override_settings(DOCROOT=tmp_files_dir):
+            self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
+        self.assertFileTree(storage_dir, tree)
+
+        tree = [
+            ("api", ["index.html"]),
+            "conf.py",
+            "test.html",
+        ]
+        (tmp_files_dir / "api.fjson").unlink()
+        with override_settings(DOCROOT=tmp_files_dir):
+            self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
+        self.assertFileTree(storage_dir, tree)
+
+        tree = [
+            "conf.py",
+            "test.html",
+        ]
+        shutil.rmtree(tmp_files_dir / "api")
+        with override_settings(DOCROOT=tmp_files_dir):
+            self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
+        self.assertFileTree(storage_dir, tree)
+
+    def test_rclone_sync_source_symlink(self):
+        tmp_dir = Path(tempfile.mkdtemp())
+        tmp_symlink_dir = Path(tempfile.mkdtemp()) / "files"
+        tmp_symlink_dir.symlink_to(tmp_dir)
+
+        with override_settings(DOCROOT=tmp_dir):
+            with pytest.raises(SuspiciousFileOperation, match="symbolic link"):
+                self.storage.rclone_sync_directory(tmp_symlink_dir, "files")
+
+    def test_rclone_sync_source_outside_docroot(self):
+        tmp_dir = Path(tempfile.mkdtemp())
+        tmp_docroot = Path(tempfile.mkdtemp()) / "docroot"
+        tmp_docroot.mkdir()
+
+        with override_settings(DOCROOT=tmp_docroot):
+            with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
+                self.storage.rclone_sync_directory(tmp_dir, "files")
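
The store_build_artifacts() change above only takes the rclone path when the project carries the USE_RCLONE feature flag, so the rollout can be gradual. A hedged sketch of opting one project in from a Django shell follows; the slug is a placeholder, and the feature_id field name is an assumption about the Feature model, which this diff shows only in part.

    # Hedged sketch: enable the rclone sync path for a single project.
    # "myproject" is a placeholder slug; feature_id is the assumed name of
    # Feature's identifier field (not visible in this diff).
    from readthedocs.projects.models import Feature, Project

    project = Project.objects.get(slug="myproject")
    feature, _ = Feature.objects.get_or_create(feature_id=Feature.USE_RCLONE)
    feature.projects.add(project)

    # store_build_artifacts() will now route through rclone_sync_directory().
    assert project.has_feature(Feature.USE_RCLONE)
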
diff --git a/readthedocs/storage/rclone.py b/readthedocs/storage/rclone.py
new file mode 100644
index 00000000000..3552e2c3f86
--- /dev/null
+++ b/readthedocs/storage/rclone.py
@@ -0,0 +1,192 @@
+"""
+Wrapper around the rclone command.
+
+See https://rclone.org/docs.
+"""
+
+import os
+import subprocess
+
+import structlog
+from django.utils._os import safe_join as safe_join_fs
+from storages.utils import safe_join
+
+log = structlog.get_logger(__name__)
+
+
+class BaseRClone:
+
+    """
+    RClone base class.
+
+    This class allows you to interact with an rclone remote without
+    a configuration file; the remote declaration and its options
+    are passed in the command itself.
+
+    This base class allows you to use the local file system as the remote.
+
+    :param remote_type: You can see the full list of supported providers at
+        https://rclone.org/#providers.
+    :param rclone_bin: Binary name or path to the rclone binary.
+        Defaults to ``rclone``.
+    :param default_options: Options passed to the rclone command.
+    :param env_vars: Environment variables used when executing the rclone command.
+        Useful for passing secrets to the ``rclone`` command, since all arguments
+        and options are logged.
+    """
+
+    remote_type = None
+    rclone_bin = "rclone"
+    default_options = [
+        # Number of file transfers to run in parallel.
+        # Default value is 4.
+        "--transfers=8",
+        # Skip based on checksum (if available) & size, not mod-time & size.
+        "--checksum",
+        "--verbose",
+    ]
+    env_vars = {}
+
+    def _get_target_path(self, path):
+        """
+        Get the final target path for the remote.
+
+        .. note::
+
+           This doesn't include the remote type;
+           it's just the destination path.
+        """
+        raise NotImplementedError
+
+    def get_target(self, path):
+        """
+        Get the proper target using the current remote type.
+
+        We start the remote with `:` to create it on the fly,
+        instead of having to create a configuration file.
+        See https://rclone.org/docs/#backend-path-to-dir.
+
+        :param path: Path to the remote target.
+        """
+        path = self._get_target_path(path)
+        return f":{self.remote_type}:{path}"
+
+    def execute(self, subcommand, args, options=None):
+        """
+        Execute an rclone subcommand.
+
+        :param subcommand: Name of the subcommand.
+        :param list args: List of positional arguments passed to the command.
+        :param list options: List of options passed to the command.
+        """
+        options = options or []
+        command = [
+            self.rclone_bin,
+            subcommand,
+            *self.default_options,
+            *options,
+            "--",
+            *args,
+        ]
+        env = os.environ.copy()
+        env.update(self.env_vars)
+        log.info("Executing rclone command.", command=command)
+        log.debug("Executing rclone command.", env=env)
+        result = subprocess.run(
+            command,
+            capture_output=True,
+            env=env,
+            check=True,
+        )
+        log.debug(
+            "rclone execution finished.",
+            stdout=result.stdout.decode(),
+            stderr=result.stderr.decode(),
+            exit_code=result.returncode,
+        )
+        return result
+
+    def sync(self, source, destination):
+        """
+        Run the `rclone sync` command.
+
+        See https://rclone.org/commands/rclone_sync/.
+
+        :param source: Local path to the source directory.
+        :param destination: Remote path to the destination directory.
+        """
+        return self.execute("sync", args=[source, self.get_target(destination)])
+
+
+class RCloneLocal(BaseRClone):
+
+    """
+    RClone remote implementation for the local file system.
+
+    Used for local testing only.
+
+    See https://rclone.org/local/.
+
+    :param location: Root directory where the files will be stored.
+ """ + + remote_type = "local" + + def __init__(self, location): + self.location = location + + def _get_target_path(self, path): + return safe_join_fs(self.location, path) + + +class RCloneS3Remote(BaseRClone): + + """ + RClone remote implementation for S3. + + All secrets will be passed as environ variables to the rclone command. + + See https://rclone.org/s3/. + + :params bucket_name: Name of the S3 bucket. + :params access_key_id: AWS access key id. + :params secret_acces_key: AWS secret access key. + :params region: AWS region. + :params provider: S3 provider, defaults to ``AWS``. + Useful to use Minio during development. + See https://rclone.org/s3/#s3-provider. + :param acl: Canned ACL used when creating buckets and storing or copying objects. + See https://rclone.org/s3/#s3-acl. + :param endpoint: Custom S3 endpoint, useful for development. + """ + + remote_type = "s3" + + def __init__( + self, + bucket_name, + access_key_id, + secret_acces_key, + region, + provider="AWS", + acl=None, + endpoint=None, + ): + # rclone S3 options passed as env vars. + # https://rclone.org/s3/#standard-options. + self.env_vars = { + "RCLONE_S3_PROVIDER": provider, + "RCLONE_S3_ACCESS_KEY_ID": access_key_id, + "RCLONE_S3_SECRET_ACCESS_KEY": secret_acces_key, + "RCLONE_S3_REGION": region, + "RCLONE_S3_LOCATION_CONSTRAINT": region, + } + if acl: + self.env_vars["RCLONE_S3_ACL"] = acl + if endpoint: + self.env_vars["RCLONE_S3_ENDPOINT"] = endpoint + self.bucket_name = bucket_name + + def _get_target_path(self, path): + """Overridden to prepend the bucket name to the path.""" + return safe_join(self.bucket_name, path) diff --git a/readthedocs/storage/s3_storage.py b/readthedocs/storage/s3_storage.py index 197602ea50c..d0d8e42cccb 100644 --- a/readthedocs/storage/s3_storage.py +++ b/readthedocs/storage/s3_storage.py @@ -9,16 +9,40 @@ # Disable abstract method because we are not overriding all the methods # pylint: disable=abstract-method +from functools import cached_property + from django.conf import settings from django.core.exceptions import ImproperlyConfigured from storages.backends.s3boto3 import S3Boto3Storage, S3ManifestStaticStorage from readthedocs.builds.storage import BuildMediaStorageMixin +from readthedocs.storage.rclone import RCloneS3Remote from .mixins import OverrideHostnameMixin, S3PrivateBucketMixin -class S3BuildMediaStorage(BuildMediaStorageMixin, OverrideHostnameMixin, S3Boto3Storage): +class S3BuildMediaStorageMixin(BuildMediaStorageMixin, S3Boto3Storage): + @cached_property + def _rclone(self): + provider = "AWS" + # If a custom endpoint URL is given and + # we are running in DEBUG mode, use minio as provider. 
+        if self.endpoint_url and settings.DEBUG:
+            provider = "minio"
+
+        return RCloneS3Remote(
+            bucket_name=self.bucket_name,
+            access_key_id=self.access_key,
+            secret_access_key=self.secret_key,
+            region=self.region_name or "",
+            acl=self.default_acl,
+            endpoint=self.endpoint_url,
+            provider=provider,
+        )
+
+
+# pylint: disable=too-many-ancestors
+class S3BuildMediaStorage(OverrideHostnameMixin, S3BuildMediaStorageMixin):
 
     """An AWS S3 Storage backend for build artifacts."""
 
@@ -94,7 +118,7 @@ class NoManifestS3StaticStorage(
     """
 
 
-class S3BuildEnvironmentStorage(S3PrivateBucketMixin, BuildMediaStorageMixin, S3Boto3Storage):
+class S3BuildEnvironmentStorage(S3PrivateBucketMixin, S3BuildMediaStorageMixin):
 
     bucket_name = getattr(settings, 'S3_BUILD_ENVIRONMENT_STORAGE_BUCKET', None)
 
@@ -108,7 +132,7 @@ def __init__(self, *args, **kwargs):
         )
 
 
-class S3BuildToolsStorage(S3PrivateBucketMixin, BuildMediaStorageMixin, S3Boto3Storage):
+class S3BuildToolsStorage(S3PrivateBucketMixin, S3BuildMediaStorageMixin):
 
     bucket_name = getattr(settings, 'S3_BUILD_TOOLS_STORAGE_BUCKET', None)
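
Finally, a hedged sketch of what the S3 remote resolves to at runtime; every name and credential below is a placeholder, and the command line in the comment is derived from execute() and get_target() above.

    # Hedged sketch: building the S3 remote by hand, mirroring what
    # S3BuildMediaStorageMixin._rclone assembles from the storage settings.
    from readthedocs.storage.rclone import RCloneS3Remote

    remote = RCloneS3Remote(
        bucket_name="example-media-bucket",
        access_key_id="EXAMPLE_KEY_ID",
        secret_access_key="EXAMPLE_SECRET",
        region="us-east-1",
    )

    # Secrets travel as RCLONE_S3_* environment variables, so the logged
    # command stays credential-free:
    #   rclone sync --transfers=8 --checksum --verbose -- \
    #       /tmp/build-output/html :s3:example-media-bucket/html/myproject/latest
    remote.sync("/tmp/build-output/html", "html/myproject/latest")
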