Build: handle symlinks in a centralized way #9800

Status: Closed (wants to merge 2 commits)
16 changes: 1 addition & 15 deletions readthedocs/builds/storage.py
@@ -5,6 +5,7 @@
from django.core.exceptions import SuspiciousFileOperation
from django.core.files.storage import FileSystemStorage
from storages.utils import get_available_overwrite_name, safe_join
from readthedocs.doc_builder.exceptions import UnsupportedSymlinkFileError

from readthedocs.core.utils.filesystem import safe_open

@@ -87,14 +88,6 @@ def copy_directory(self, source, destination):
for filepath in source.iterdir():
sub_destination = self.join(destination, filepath.name)

# Don't follow symlinks when uploading to storage.
if filepath.is_symlink():
log.info(
"Skipping symlink upload.",
path_resolved=str(filepath.resolve()),
)
continue

Comment on lines -90 to -97
Member: I don't think we should follow symlinks when uploading files (GitHub Pages doesn't, for example). With our plans to test rclone, this matters too: rclone doesn't support something like "follow symlinks but only inside this path". We also need to check that the symlinks aren't outside the version path, not just the docroot path.

Member Author: I removed this check from here so that this decision, "do not follow symlinks", is communicated to the user clearly. Otherwise, with this check we are just ignoring symlinks without telling the user, which leads to unexpected behavior from the user's perspective.
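A minimal sketch of the kind of containment check being discussed; the `check_upload_path` and `version_path` names are illustrative, not the actual implementation:

```python
from pathlib import Path

from readthedocs.doc_builder.exceptions import UnsupportedSymlinkFileError


def check_upload_path(filepath: Path, version_path: Path):
    """Reject symlinks whose target resolves outside the version directory."""
    if filepath.is_symlink():
        resolved = filepath.resolve()
        # Raise a user-visible build error instead of silently skipping the file.
        if not resolved.is_relative_to(version_path.resolve()):
            raise UnsupportedSymlinkFileError(filepath=str(filepath))
```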

if filepath.is_dir():
# Recursively copy the subdirectory
self.copy_directory(filepath, sub_destination)
@@ -125,13 +118,6 @@ def sync_directory(self, source, destination):
copied_dirs = set()
for filepath in source.iterdir():
sub_destination = self.join(destination, filepath.name)
# Don't follow symlinks when uploading to storage.
if filepath.is_symlink():
log.info(
"Skipping symlink upload.",
path_resolved=str(filepath.resolve()),
)
continue

if filepath.is_dir():
# Recursively sync the subdirectory
2 changes: 1 addition & 1 deletion readthedocs/config/config.py
@@ -1384,7 +1384,7 @@ def load(path, env_config):

# Allow symlinks, but only the ones that resolve inside the base directory.
with safe_open(
filename, "r", allow_symlinks=True, base_path=path
filename, "r", base_path=path
) as configuration_file:
try:
config = parse(configuration_file.read())
86 changes: 23 additions & 63 deletions readthedocs/core/utils/filesystem.py
@@ -21,32 +21,16 @@
log = structlog.get_logger(__name__)


def _assert_path_is_inside_docroot(path):
resolved_path = path.absolute().resolve()
docroot = Path(settings.DOCROOT).absolute()
if not path.is_relative_to(docroot):
log.error(
"Suspicious operation outside the docroot directory.",
path_resolved=str(resolved_path),
)
raise SuspiciousFileOperation(path)


def safe_open(path, *args, allow_symlinks=False, base_path=None, **kwargs):
def safe_open(path, *args, base_path=None, **kwargs):
"""
Wrapper around path.open() to check for symlinks.

- Checks for symlinks to avoid symlink following vulnerabilities
like GHSA-368m-86q9-m99w.
- Checks that files aren't out of the DOCROOT directory.

:param allow_symlinks: If `False` and the path is a symlink, raise `FileIsSymlink`
This prevents reading the contents of other files users shouldn't have access to.

:param base_path: If given, check that the path isn't located outside the base path
(usually the directory where the project was cloned).
It must be given if `allow_symlinks` is set to `True`.
This prevents path traversal attacks (even when using symlinks).

The extra *args and **kwargs will be passed to the open() method.

@@ -55,43 +39,44 @@ def safe_open(path, *args, allow_symlinks=False, base_path=None, **kwargs):
This operation isn't safe to TocTou (Time-of-check to Time-of-use) attacks.
Users shouldn't be able to change files while this operation is done.
"""
if allow_symlinks and not base_path:
raise ValueError("base_path mut be given if symlinks are allowed.")

docroot = Path(settings.DOCROOT).absolute()
path = Path(path).absolute()

# TODO: this `relative_path` can be improved to remove the first part:
# /symlink-security-exploit/artifacts/latest/generic/passwd.txt
# This is shown to the user currently.
relative_path = str(path).replace(str(docroot.resolve()), '')
Member: There is the Path.relative_to method that should be used instead of replace.
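For reference, a small illustration of the suggested `Path.relative_to` approach, reusing the made-up path from the TODO above:

```python
from pathlib import Path

docroot = Path("/data/docroot").resolve()
path = (docroot / "symlink-security-exploit/artifacts/latest/generic/passwd.txt").resolve()

# str.replace() operates on raw strings and keeps the leading separator;
# relative_to() computes the path structurally and raises ValueError if
# the path is not inside docroot.
relative_path = path.relative_to(docroot)
print(relative_path)  # symlink-security-exploit/artifacts/latest/generic/passwd.txt
```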

log.bind(
path_resolved=str(path.absolute().resolve()),
relative_path=relative_path,
)

if path.is_symlink():
symlink_path = str(path.readlink().resolve())
Member: resolve already follows symlinks; readlink isn't needed here.
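A quick illustration of the difference, assuming `docs/current` is a symlink whose target is `v2`:

```python
from pathlib import Path

link = Path("docs/current")  # assume this is a symlink pointing at "v2"

print(link.readlink())  # the raw link target, possibly relative: PosixPath('v2')
print(link.resolve())   # absolute path with every symlink followed, e.g. /home/user/docs/v2
```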

log.bind(
symlink_path=symlink_path,
)

if path.exists() and not path.is_file():
raise FileIsNotRegularFile(path)

if not allow_symlinks and path.is_symlink():
log.info("Skipping file becuase it's a symlink.")
raise UnsupportedSymlinkFileError(path)

# Expand symlinks.
# Expand symlinks
resolved_path = path.resolve()

if allow_symlinks and base_path:
base_path = Path(base_path).absolute()
if not resolved_path.is_relative_to(base_path):
# Trying to path traversal via a symlink, sneaky!
log.info("Path traversal via symlink.")
raise SymlinkOutsideBasePath(path)

_assert_path_is_inside_docroot(resolved_path)
# Don't follow symlinks outside DOCROOT or base_path
if path.is_symlink() and (not resolved_path.is_relative_to(docroot) or (base_path and not resolved_path.is_relative_to(base_path))):
raise UnsupportedSymlinkFileError(filepath=relative_path)

return resolved_path.open(*args, **kwargs)
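For context, a hedged sketch of how callers use the new signature; the paths are illustrative stand-ins for the builder attributes used further down in this diff:

```python
from readthedocs.core.utils.filesystem import safe_open
from readthedocs.doc_builder.exceptions import UnsupportedSymlinkFileError

project_path = "/path/to/checkout"        # illustrative checkout directory
yaml_file = f"{project_path}/mkdocs.yml"  # illustrative config file

try:
    # base_path is the project checkout; a symlink resolving outside it
    # (or outside DOCROOT) now raises instead of being silently skipped.
    with safe_open(yaml_file, "r", base_path=project_path) as fh:
        data = fh.read()
except UnsupportedSymlinkFileError:
    # A BuildUserError subclass, so the message is shown in the build output.
    raise
```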


# NOTE: I think `safe_copytree` is useful with `symlinks=True`,
# and we shouldn't perform all the other checks here.
# We are always using `safe_open` at the storage level,
# and we can keep all these checks contained there (in one place)
def safe_copytree(from_dir, to_dir, *args, **kwargs):
"""
Wrapper around shutil.copytree to check for symlinks.

If any of the directories point to symlinks, cancel the operation.
We don't want to copy contents outside of those directories.
Wrapper around shutil.copytree to copy symlinks as is, instead of its contents.

The extra *args and **kwargs will be passed to the copytree() function.

@@ -100,21 +85,6 @@ def safe_copytree(from_dir, to_dir, *args, **kwargs):
This operation isn't safe to TocTou (Time-of-check to Time-of-use) attacks.
Users shouldn't be able to change files while this operation is done.
"""
from_dir = Path(from_dir)
to_dir = Path(to_dir)
if from_dir.is_symlink() or to_dir.is_symlink():
log.info(
"Not copying directory, one of the paths is a symlink.",
from_dir=from_dir,
from_dir_resolved=from_dir.resolve(),
to_dir=to_dir,
to_dir_resolved=to_dir.resolve(),
)
return False

_assert_path_is_inside_docroot(from_dir)
_assert_path_is_inside_docroot(to_dir)

Comment on lines -103 to -117
Member: These checks are needed: copytree will follow the symlink when copying the initial directory (there is a test for this as well); what it won't do is follow any symlinks inside that directory. The check with _assert_path_is_inside_docroot is also useful, since it helps us prevent directory traversal vulnerabilities from being exploited.

Member Author: Regarding "copytree will follow the symlink when doing the copy of the initial directory (there is a test for this as well)": what is that initial directory you refer to here?

Member: The directory itself that you are going to copy (from_dir).
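A minimal demonstration of that point, using throwaway temp paths:

```python
import shutil
import tempfile
from pathlib import Path

base = Path(tempfile.mkdtemp())
real = base / "real"
real.mkdir()
(real / "secret.txt").write_text("data")
link = base / "link"
link.symlink_to(real)  # "link" is itself a symlink to the "real" directory

# symlinks=True preserves symlinks found *inside* the tree, but the top-level
# source is still followed: the contents of "real" are copied into "copy".
shutil.copytree(link, base / "copy", symlinks=True)
print(sorted(p.name for p in (base / "copy").iterdir()))  # ['secret.txt']
```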

return shutil.copytree(
from_dir,
to_dir,
@@ -127,21 +97,11 @@ def safe_rmtree(path, *args, **kwargs):

def safe_rmtree(path, *args, **kwargs):
"""
Wrapper around shutil.rmtree to check for symlinks.
Wrapper around shutil.rmtree for security reasons.

shutil.rmtree doens't follow symlinks by default,
this function just logs in case users are trying to use symlinks.
https://docs.python.org/3/library/shutil.html#shutil.rmtree

The extra *args and **kwargs will be passed to the rmtree() function.
"""
path = Path(path)
if path.is_symlink():
log.info(
"Not deleting directory because it's a symlink.",
path=str(path),
resolved_path=path.resolve(),
)
return None
_assert_path_is_inside_docroot(path)
Member: rmtree fails hard (OSError) when trying to delete a directory that points to a symlink, but _assert_path_is_inside_docroot is still useful, as mentioned in my other comment.
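A tiny illustration of that failure mode, again with throwaway paths:

```python
import shutil
import tempfile
from pathlib import Path

base = Path(tempfile.mkdtemp())
(base / "real").mkdir()
(base / "link").symlink_to(base / "real")

# rmtree refuses to operate on a path that is itself a symlink and raises OSError.
shutil.rmtree(base / "link")
```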

return shutil.rmtree(path, *args, **kwargs)
6 changes: 2 additions & 4 deletions readthedocs/doc_builder/backends/mkdocs.py
@@ -73,9 +73,8 @@ def get_final_doctype(self):

https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
"""
# Allow symlinks, but only the ones that resolve inside the base directory.
with safe_open(
self.yaml_file, "r", allow_symlinks=True, base_path=self.project_path
self.yaml_file, "r", base_path=self.project_path
) as fh:
config = yaml_load_safely(fh)
use_directory_urls = config.get('use_directory_urls', True)
@@ -98,9 +97,8 @@ def load_yaml_config(self):
:raises: ``MkDocsYAMLParseError`` if failed due to syntax errors.
"""
try:
# Allow symlinks, but only the ones that resolve inside the base directory.
result = safe_open(
self.yaml_file, "r", allow_symlinks=True, base_path=self.project_path
self.yaml_file, "r", base_path=self.project_path
)
if not result:
raise UserFileNotFound(
3 changes: 1 addition & 2 deletions readthedocs/doc_builder/backends/sphinx.py
@@ -233,9 +233,8 @@ def append_conf(self):
self.config_file = (
self.config_file or self.project.conf_file(self.version.slug)
)
# Allow symlinks, but only the ones that resolve inside the base directory.
outfile = safe_open(
self.config_file, "a", allow_symlinks=True, base_path=self.project_path
self.config_file, "a", base_path=self.project_path
)
if not outfile:
raise UserFileNotFound(
12 changes: 10 additions & 2 deletions readthedocs/doc_builder/exceptions.py
@@ -97,9 +97,17 @@ class MkDocsYAMLParseError(BuildUserError):
)


# TODO: improve messages for symlink errors with a more detailed error and include the `filepath`.
class UnsupportedSymlinkFileError(BuildUserError):
message = gettext_noop("Symlinks are not fully supported")
message = gettext_noop(
"There is at least one file ({filepath}) in the output that is a symlink. "
"Please, review your output directory and remove the symlinks. "
"Symlinks are not fully supported."
)

def __init__(self, message=None, **kwargs):
filepath = kwargs.pop('filepath')
message = self.message.format(filepath=filepath)
super().__init__(message, **kwargs)


class FileIsNotRegularFile(UnsupportedSymlinkFileError):
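A hedged usage sketch for the updated exception; the path is made up, and in this PR the error is raised from safe_open with the docroot-relative path:

```python
from readthedocs.doc_builder.exceptions import UnsupportedSymlinkFileError

# The offending file's relative path is interpolated into the user-facing message.
raise UnsupportedSymlinkFileError(filepath="/artifacts/latest/generic/passwd.txt")
```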
4 changes: 0 additions & 4 deletions readthedocs/doc_builder/python_environments.py
@@ -395,14 +395,12 @@ def _append_core_requirements(self):
See https://github.com/readthedocs/readthedocs.org/pull/5631
"""
try:
# Allow symlinks, but only the ones that resolve inside the base directory.
inputfile = safe_open(
os.path.join(
self.checkout_path,
self.config.conda.environment,
),
"r",
allow_symlinks=True,
base_path=self.checkout_path,
)
if not inputfile:
@@ -438,14 +436,12 @@ def _append_core_requirements(self):
dependencies.extend(conda_requirements)
environment.update({'dependencies': dependencies})
try:
# Allow symlinks, but only the ones that resolve inside the base directory.
outputfile = safe_open(
os.path.join(
self.checkout_path,
self.config.conda.environment,
),
"w",
allow_symlinks=True,
base_path=self.checkout_path,
)
if not outputfile:
53 changes: 33 additions & 20 deletions readthedocs/projects/tasks/builds.py
@@ -48,6 +48,7 @@
MkDocsYAMLParseError,
ProjectBuildsSkippedError,
YAMLParseError,
UnsupportedSymlinkFileError,
)
from readthedocs.storage import build_media_storage
from readthedocs.telemetry.collectors import BuildDataCollector
@@ -514,29 +515,13 @@ def on_failure(self, exc, task_id, args, kwargs, einfo):
self.data.build['success'] = False

def on_success(self, retval, task_id, args, kwargs):
html = self.data.outcomes['html']
search = self.data.outcomes['search']
localmedia = self.data.outcomes['localmedia']
pdf = self.data.outcomes['pdf']
epub = self.data.outcomes['epub']

time_before_store_build_artifacts = timezone.now()
# Store build artifacts to storage (local or cloud storage)
self.store_build_artifacts(
Member: We should probably do this change in another PR; it doesn't look related to symlinks.

Member Author: We need to move this call inside execute: we cannot raise exceptions from on_success and still handle them in on_failure, since on_failure is not called once the code is already inside on_success.
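A simplified sketch of the Celery handler semantics the author is relying on (not the actual readthedocs task code; method bodies elided):

```python
from celery import Task


class BuildTaskSketch(Task):
    """Simplified model of the build task for this discussion."""

    def run(self, *args, **kwargs):
        self.execute()  # exceptions raised here are routed to on_failure

    def execute(self):
        # Storing build artifacts here (as this PR does) means an upload error,
        # e.g. UnsupportedSymlinkFileError, can fail the build through the
        # normal failure path.
        ...

    def on_success(self, retval, task_id, args, kwargs):
        # Runs only after run() has already succeeded; per the author's comment,
        # an exception raised here is not handed to on_failure.
        ...

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        ...
```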

html=html,
search=search,
localmedia=localmedia,
pdf=pdf,
epub=epub,
)
log.info(
"Store build artifacts finished.",
time=(timezone.now() - time_before_store_build_artifacts).seconds,
)

# NOTE: we are updating the db version instance *only* when
# HTML are built successfully.
html = self.data.outcomes['html']
if html:
localmedia = self.data.outcomes['localmedia']
pdf = self.data.outcomes['pdf']
epub = self.data.outcomes['epub']
try:
api_v2.version(self.data.version.pk).patch(
{
@@ -699,6 +684,26 @@ def execute(self):
finally:
self.data.build_data = self.collect_build_data()

html = self.data.outcomes['html']
search = self.data.outcomes['search']
localmedia = self.data.outcomes['localmedia']
pdf = self.data.outcomes['pdf']
epub = self.data.outcomes['epub']

time_before_store_build_artifacts = timezone.now()
# Store build artifacts to storage (local or cloud storage)
self.store_build_artifacts(
html=html,
search=search,
localmedia=localmedia,
pdf=pdf,
epub=epub,
)
log.info(
"Store build artifacts finished.",
time=(timezone.now() - time_before_store_build_artifacts).seconds,
)

def collect_build_data(self):
"""
Collect data from the current build.
@@ -834,9 +839,14 @@ def store_build_artifacts(
)
try:
build_media_storage.sync_directory(from_path, to_path)
except UnsupportedSymlinkFileError:
raise
except Exception:
# Ideally this should just be an IOError
# but some storage backends unfortunately throw other errors
#
# NOTE: this should hard fail the build to avoid unexpected results:
# missing pages, for example
log.exception(
'Error copying to storage (not failing build)',
media_type=media_type,
Expand All @@ -856,6 +866,9 @@ def store_build_artifacts(
except Exception:
# Ideally this should just be an IOError
# but some storage backends unfortunately throw other errors
#
# NOTE: this should hard fail the build to avoid unexpected results:
# pages that should not exist anymore, for example
log.exception(
'Error deleting from storage (not failing build)',
media_type=media_type,