From a28b28263306a45d4d99c3ea7a926c2f05c3dc99 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 8 Dec 2022 20:32:46 +0100 Subject: [PATCH 1/2] Build: hard fail builds when there are symlinks in the output Hard fail the build if there are symlinks in the output. --- readthedocs/builds/storage.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py index b4ddc5305c2..6b1b97d1a43 100644 --- a/readthedocs/builds/storage.py +++ b/readthedocs/builds/storage.py @@ -5,6 +5,7 @@ from django.core.exceptions import SuspiciousFileOperation from django.core.files.storage import FileSystemStorage from storages.utils import get_available_overwrite_name, safe_join +from readthedocs.doc_builder.exceptions import UnsupportedSymlinkFileError from readthedocs.core.utils.filesystem import safe_open @@ -89,11 +90,11 @@ def copy_directory(self, source, destination): # Don't follow symlinks when uploading to storage. if filepath.is_symlink(): - log.info( + log.warning( "Skipping symlink upload.", path_resolved=str(filepath.resolve()), ) - continue + raise UnsupportedSymlinkFileError(filepath) if filepath.is_dir(): # Recursively copy the subdirectory @@ -127,11 +128,11 @@ def sync_directory(self, source, destination): sub_destination = self.join(destination, filepath.name) # Don't follow symlinks when uploading to storage. 
if filepath.is_symlink(): - log.info( + log.warning( "Skipping symlink upload.", path_resolved=str(filepath.resolve()), ) - continue + raise UnsupportedSymlinkFileError(filepath) if filepath.is_dir(): # Recursively sync the subdirectory From fadc357a55a5ca4d691e4ad58f6814152d3dca14 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Mon, 12 Dec 2022 19:22:02 +0100 Subject: [PATCH 2/2] Build: handle symlinks in a centralized way Simplifies symlinks handling to centralize the security logic in one place only (`safe_open`); reducing the complexity of this code, making it easier to maintain and more secure. Besides, it improves the UX for users to understand why their build is failing. It clearly communicates the error using our internal pattern for known exception managed at `Celery.on_failure`. These are the highlights of this work: - Always follow symlinks - Always copy symlink as-is. Only resolve them when uploading to storage. - Always check symlinks are inside DOCROOT and `base_path` if passed - Hard fail the build if there are symlinks in the output that are outside DOCROOT - Centralize all the symlink checks in `safe_open` (used to open configuration files while building and to open files for uploading them) - Clearly communicate the error to the user - Move upload to storage function inside `Celery.execute` handler to allow us raising `UnsupportedSymlinkFileError` and using `on_failure` to clearly communicate the error to the user --- readthedocs/builds/storage.py | 15 ---- readthedocs/config/config.py | 2 +- readthedocs/core/utils/filesystem.py | 86 +++++-------------- readthedocs/doc_builder/backends/mkdocs.py | 6 +- readthedocs/doc_builder/backends/sphinx.py | 3 +- readthedocs/doc_builder/exceptions.py | 12 ++- .../doc_builder/python_environments.py | 4 - readthedocs/projects/tasks/builds.py | 53 +++++++----- 8 files changed, 70 insertions(+), 111 deletions(-) diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py index 
6b1b97d1a43..ab243b95229 100644 --- a/readthedocs/builds/storage.py +++ b/readthedocs/builds/storage.py @@ -88,14 +88,6 @@ def copy_directory(self, source, destination): for filepath in source.iterdir(): sub_destination = self.join(destination, filepath.name) - # Don't follow symlinks when uploading to storage. - if filepath.is_symlink(): - log.warning( - "Skipping symlink upload.", - path_resolved=str(filepath.resolve()), - ) - raise UnsupportedSymlinkFileError(filepath) - if filepath.is_dir(): # Recursively copy the subdirectory self.copy_directory(filepath, sub_destination) @@ -126,13 +118,6 @@ def sync_directory(self, source, destination): copied_dirs = set() for filepath in source.iterdir(): sub_destination = self.join(destination, filepath.name) - # Don't follow symlinks when uploading to storage. - if filepath.is_symlink(): - log.warning( - "Skipping symlink upload.", - path_resolved=str(filepath.resolve()), - ) - raise UnsupportedSymlinkFileError(filepath) if filepath.is_dir(): # Recursively sync the subdirectory diff --git a/readthedocs/config/config.py b/readthedocs/config/config.py index 3c755bdd612..5d303da0a85 100644 --- a/readthedocs/config/config.py +++ b/readthedocs/config/config.py @@ -1384,7 +1384,7 @@ def load(path, env_config): # Allow symlinks, but only the ones that resolve inside the base directory. 
with safe_open( - filename, "r", allow_symlinks=True, base_path=path + filename, "r", base_path=path ) as configuration_file: try: config = parse(configuration_file.read()) diff --git a/readthedocs/core/utils/filesystem.py b/readthedocs/core/utils/filesystem.py index 6ede92cd35b..ec01d89abd2 100644 --- a/readthedocs/core/utils/filesystem.py +++ b/readthedocs/core/utils/filesystem.py @@ -21,18 +21,7 @@ log = structlog.get_logger(__name__) -def _assert_path_is_inside_docroot(path): - resolved_path = path.absolute().resolve() - docroot = Path(settings.DOCROOT).absolute() - if not path.is_relative_to(docroot): - log.error( - "Suspicious operation outside the docroot directory.", - path_resolved=str(resolved_path), - ) - raise SuspiciousFileOperation(path) - - -def safe_open(path, *args, allow_symlinks=False, base_path=None, **kwargs): +def safe_open(path, *args, base_path=None, **kwargs): """ Wrapper around path.open() to check for symlinks. @@ -40,13 +29,8 @@ def safe_open(path, *args, allow_symlinks=False, base_path=None, **kwargs): like GHSA-368m-86q9-m99w. - Checks that files aren't out of the DOCROOT directory. - :param allow_symlinks: If `False` and the path is a symlink, raise `FileIsSymlink` - This prevents reading the contents of other files users shouldn't have access to. - :param base_path: If given, check that the path isn't located outside the base path (usually the directory where the project was cloned). - It must be given if `allow_symlinks` is set to `True`. - This prevents path traversal attacks (even when using symlinks). The extra *args and **kwargs will be passed to the open() method. @@ -55,43 +39,44 @@ def safe_open(path, *args, allow_symlinks=False, base_path=None, **kwargs): This operation isn't safe to TocTou (Time-of-check to Time-of-use) attacks. Users shouldn't be able to change files while this operation is done. 
""" - if allow_symlinks and not base_path: - raise ValueError("base_path mut be given if symlinks are allowed.") + docroot = Path(settings.DOCROOT).absolute() path = Path(path).absolute() + # TODO: this `relative_path` can be improved to remove the first part: + # /symlink-security-exploit/artifacts/latest/generic/passwd.txt + # This is shown to the user currently. + relative_path = str(path).replace(str(docroot.resolve()), '') log.bind( - path_resolved=str(path.absolute().resolve()), + relative_path=relative_path, ) + if path.is_symlink(): + symlink_path = str(path.readlink().resolve()) + log.bind( + symlink_path=symlink_path, + ) + if path.exists() and not path.is_file(): raise FileIsNotRegularFile(path) - if not allow_symlinks and path.is_symlink(): - log.info("Skipping file becuase it's a symlink.") - raise UnsupportedSymlinkFileError(path) - - # Expand symlinks. + # Expand symlinks resolved_path = path.resolve() - if allow_symlinks and base_path: - base_path = Path(base_path).absolute() - if not resolved_path.is_relative_to(base_path): - # Trying to path traversal via a symlink, sneaky! - log.info("Path traversal via symlink.") - raise SymlinkOutsideBasePath(path) - - _assert_path_is_inside_docroot(resolved_path) + # Don't follow symlinks outside DOCROOT or base_path + if path.is_symlink() and (not resolved_path.is_relative_to(docroot) or (base_path and not resolved_path.is_relative_to(base_path))): + raise UnsupportedSymlinkFileError(filepath=relative_path) return resolved_path.open(*args, **kwargs) +# NOTE: I think `safe_copytree` is useful with `symlinks=True`, +# and we shouldn't perform all the other checks here. +# We are always using `safe_open` at the storage level, +# and we can keep all these checks contained there (in one place) def safe_copytree(from_dir, to_dir, *args, **kwargs): """ - Wrapper around shutil.copytree to check for symlinks. - - If any of the directories point to symlinks, cancel the operation. 
- We don't want to copy contents outside of those directories. + Wrapper around shutil.copytree to copy symlinks as is, instead of its contents. The extra *args and **kwargs will be passed to the copytree() function. @@ -100,21 +85,6 @@ def safe_copytree(from_dir, to_dir, *args, **kwargs): This operation isn't safe to TocTou (Time-of-check to Time-of-use) attacks. Users shouldn't be able to change files while this operation is done. """ - from_dir = Path(from_dir) - to_dir = Path(to_dir) - if from_dir.is_symlink() or to_dir.is_symlink(): - log.info( - "Not copying directory, one of the paths is a symlink.", - from_dir=from_dir, - from_dir_resolved=from_dir.resolve(), - to_dir=to_dir, - to_dir_resolved=to_dir.resolve(), - ) - return False - - _assert_path_is_inside_docroot(from_dir) - _assert_path_is_inside_docroot(to_dir) - return shutil.copytree( from_dir, to_dir, @@ -127,21 +97,11 @@ def safe_copytree(from_dir, to_dir, *args, **kwargs): def safe_rmtree(path, *args, **kwargs): """ - Wrapper around shutil.rmtree to check for symlinks. + Wrapper around shutil.rmtree for security reasons. shutil.rmtree doens't follow symlinks by default, - this function just logs in case users are trying to use symlinks. https://docs.python.org/3/library/shutil.html#shutil.rmtree The extra *args and **kwargs will be passed to the rmtree() function. 
""" - path = Path(path) - if path.is_symlink(): - log.info( - "Not deleting directory because it's a symlink.", - path=str(path), - resolved_path=path.resolve(), - ) - return None - _assert_path_is_inside_docroot(path) return shutil.rmtree(path, *args, **kwargs) diff --git a/readthedocs/doc_builder/backends/mkdocs.py b/readthedocs/doc_builder/backends/mkdocs.py index 171c1a59830..ee6a3a073e5 100644 --- a/readthedocs/doc_builder/backends/mkdocs.py +++ b/readthedocs/doc_builder/backends/mkdocs.py @@ -73,9 +73,8 @@ def get_final_doctype(self): https://www.mkdocs.org/user-guide/configuration/#use_directory_urls """ - # Allow symlinks, but only the ones that resolve inside the base directory. with safe_open( - self.yaml_file, "r", allow_symlinks=True, base_path=self.project_path + self.yaml_file, "r", base_path=self.project_path ) as fh: config = yaml_load_safely(fh) use_directory_urls = config.get('use_directory_urls', True) @@ -98,9 +97,8 @@ def load_yaml_config(self): :raises: ``MkDocsYAMLParseError`` if failed due to syntax errors. """ try: - # Allow symlinks, but only the ones that resolve inside the base directory. result = safe_open( - self.yaml_file, "r", allow_symlinks=True, base_path=self.project_path + self.yaml_file, "r", base_path=self.project_path ) if not result: raise UserFileNotFound( diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index 9f5712b6388..a1d36da949d 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -233,9 +233,8 @@ def append_conf(self): self.config_file = ( self.config_file or self.project.conf_file(self.version.slug) ) - # Allow symlinks, but only the ones that resolve inside the base directory. 
outfile = safe_open( - self.config_file, "a", allow_symlinks=True, base_path=self.project_path + self.config_file, "a", base_path=self.project_path ) if not outfile: raise UserFileNotFound( diff --git a/readthedocs/doc_builder/exceptions.py b/readthedocs/doc_builder/exceptions.py index 3f0e0c308f6..2ae801abe9f 100644 --- a/readthedocs/doc_builder/exceptions.py +++ b/readthedocs/doc_builder/exceptions.py @@ -97,9 +97,17 @@ class MkDocsYAMLParseError(BuildUserError): ) -# TODO: improve messages for symlink errors with a more detailed error and include the `filepath`. class UnsupportedSymlinkFileError(BuildUserError): - message = gettext_noop("Symlinks are not fully supported") + message = gettext_noop( + "There is at least one file ({filepath}) in the output that is a symlink. " + "Please, review your output directory and remove the symlinks. " + "Symlinks are not fully supported." + ) + + def __init__(self, message=None, **kwargs): + filepath = kwargs.pop('filepath') + message = self.message.format(filepath=filepath) + super().__init__(message, **kwargs) class FileIsNotRegularFile(UnsupportedSymlinkFileError): diff --git a/readthedocs/doc_builder/python_environments.py b/readthedocs/doc_builder/python_environments.py index 193b9494d53..04cad4d2555 100644 --- a/readthedocs/doc_builder/python_environments.py +++ b/readthedocs/doc_builder/python_environments.py @@ -395,14 +395,12 @@ def _append_core_requirements(self): See https://github.com/readthedocs/readthedocs.org/pull/5631 """ try: - # Allow symlinks, but only the ones that resolve inside the base directory. inputfile = safe_open( os.path.join( self.checkout_path, self.config.conda.environment, ), "r", - allow_symlinks=True, base_path=self.checkout_path, ) if not inputfile: @@ -438,14 +436,12 @@ def _append_core_requirements(self): dependencies.extend(conda_requirements) environment.update({'dependencies': dependencies}) try: - # Allow symlinks, but only the ones that resolve inside the base directory. 
outputfile = safe_open( os.path.join( self.checkout_path, self.config.conda.environment, ), "w", - allow_symlinks=True, base_path=self.checkout_path, ) if not outputfile: diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index f5a72d2a0c3..eeaa2af69f3 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -48,6 +48,7 @@ MkDocsYAMLParseError, ProjectBuildsSkippedError, YAMLParseError, + UnsupportedSymlinkFileError, ) from readthedocs.storage import build_media_storage from readthedocs.telemetry.collectors import BuildDataCollector @@ -514,29 +515,13 @@ def on_failure(self, exc, task_id, args, kwargs, einfo): self.data.build['success'] = False def on_success(self, retval, task_id, args, kwargs): - html = self.data.outcomes['html'] - search = self.data.outcomes['search'] - localmedia = self.data.outcomes['localmedia'] - pdf = self.data.outcomes['pdf'] - epub = self.data.outcomes['epub'] - - time_before_store_build_artifacts = timezone.now() - # Store build artifacts to storage (local or cloud storage) - self.store_build_artifacts( - html=html, - search=search, - localmedia=localmedia, - pdf=pdf, - epub=epub, - ) - log.info( - "Store build artifacts finished.", - time=(timezone.now() - time_before_store_build_artifacts).seconds, - ) - # NOTE: we are updating the db version instance *only* when # HTML are built successfully. 
+ html = self.data.outcomes['html'] if html: + localmedia = self.data.outcomes['localmedia'] + pdf = self.data.outcomes['pdf'] + epub = self.data.outcomes['epub'] try: api_v2.version(self.data.version.pk).patch( { @@ -699,6 +684,26 @@ def execute(self): finally: self.data.build_data = self.collect_build_data() + html = self.data.outcomes['html'] + search = self.data.outcomes['search'] + localmedia = self.data.outcomes['localmedia'] + pdf = self.data.outcomes['pdf'] + epub = self.data.outcomes['epub'] + + time_before_store_build_artifacts = timezone.now() + # Store build artifacts to storage (local or cloud storage) + self.store_build_artifacts( + html=html, + search=search, + localmedia=localmedia, + pdf=pdf, + epub=epub, + ) + log.info( + "Store build artifacts finished.", + time=(timezone.now() - time_before_store_build_artifacts).seconds, + ) + def collect_build_data(self): """ Collect data from the current build. @@ -834,9 +839,14 @@ def store_build_artifacts( ) try: build_media_storage.sync_directory(from_path, to_path) + except UnsupportedSymlinkFileError: + raise except Exception: # Ideally this should just be an IOError # but some storage backends unfortunately throw other errors + # + # NOTE: this should hard fail the build to avoid unexpected results: + # missing pages, for example log.exception( 'Error copying to storage (not failing build)', media_type=media_type, @@ -856,6 +866,9 @@ def store_build_artifacts( except Exception: # Ideally this should just be an IOError # but some storage backends unfortunately throw other errors + # + # NOTE: this should hard fail the build to avoid unexpected results: + # pages that should not exist anymore, for example log.exception( 'Error deleting from storage (not failing build)', media_type=media_type,