diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py new file mode 100644 index 00000000000..3698a2e08a7 --- /dev/null +++ b/readthedocs/filetreediff/__init__.py @@ -0,0 +1,84 @@ +import json +from dataclasses import dataclass + +from readthedocs.builds.constants import BUILD_STATE_FINISHED +from readthedocs.builds.models import Version +from readthedocs.projects.constants import MEDIA_TYPE_METADATA +from readthedocs.storage import build_media_storage + + +@dataclass +class FileTreeDiff: + added: list[str] + removed: list[str] + modified: list[str] + + +def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: + version_a_manifest = get_manifest(version_a) + version_b_manifest = get_manifest(version_b) + + if not version_a_manifest or not version_b_manifest: + return None + + files_a = set(version_a_manifest.get("files", {}).keys()) + files_b = set(version_b_manifest.get("files", {}).keys()) + + files_added = list(files_a - files_b) + files_removed = list(files_b - files_a) + files_modified = [] + for file_path in files_a & files_b: + file_a = version_a_manifest["files"][file_path] + file_b = version_b_manifest["files"][file_path] + + if file_a["hash"] != file_b["hash"]: + files_modified.append(file_path) + + return FileTreeDiff( + added=files_added, + removed=files_removed, + modified=files_modified, + ) + + +def get_manifest(version: Version): + storage_path = version.project.get_storage_path( + type_=MEDIA_TYPE_METADATA, + version_slug=version.slug, + include_file=False, + version_type=version.type, + ) + manifest_path = build_media_storage.join(storage_path, "manifest.json") + try: + with build_media_storage.open(manifest_path) as manifest_file: + manifest = json.load(manifest_file) + except FileNotFoundError: + return None + + latest_successful_build = version.builds.filter( + state=BUILD_STATE_FINISHED, + success=True, + ).first() + if not latest_successful_build: + return None + + build_id_from_manifest 
= manifest.get("build", {}).get("id") + if latest_successful_build.id != build_id_from_manifest: + # The manifest is outdated, + # do we want to still use it? do we care? + # Should the caller be responsible to handle this? + return None + + return manifest + + +def write_manifest(version: Version, manifest: dict): + storage_path = version.project.get_storage_path( + type_=MEDIA_TYPE_METADATA, + version_slug=version.slug, + include_file=False, + version_type=version.type, + ) + manifest_path = build_media_storage.join(storage_path, "manifest.json") + with build_media_storage.open(manifest_path, "w") as f: + json.dump(manifest, f) diff --git a/readthedocs/projects/constants.py b/readthedocs/projects/constants.py index d1401c171e6..05b3b3d15ae 100644 --- a/readthedocs/projects/constants.py +++ b/readthedocs/projects/constants.py @@ -34,6 +34,7 @@ MEDIA_TYPE_EPUB = "epub" MEDIA_TYPE_HTMLZIP = "htmlzip" MEDIA_TYPE_JSON = "json" +MEDIA_TYPE_METADATA = "metadata" DOWNLOADABLE_MEDIA_TYPES = ( MEDIA_TYPE_PDF, MEDIA_TYPE_EPUB, @@ -45,6 +46,7 @@ MEDIA_TYPE_EPUB, MEDIA_TYPE_HTMLZIP, MEDIA_TYPE_JSON, + MEDIA_TYPE_METADATA, ) BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/" diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index b3f9a9b9aba..d22d90be0e9 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1524,13 +1524,20 @@ class Meta: objects = HTMLFileManager() def get_processed_json(self): - parser = GenericParser(self.version) - return parser.parse(self.path) + return self._parser.parse(self.path) + + @cached_property + def _parser(self): + return GenericParser(self.version) @cached_property def processed_json(self): return self.get_processed_json() + @property + def main_content(self): + return self._parser.get_main_content(self.path) + class Notification(TimeStampedModel): @@ -1887,6 +1894,7 @@ def add_features(sender, **kwargs): RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header" USE_PROXIED_APIS_WITH_PREFIX 
= "use_proxied_apis_with_prefix" ALLOW_VERSION_WARNING_BANNER = "allow_version_warning_banner" + GENERATE_MANIFEST_FOR_FILE_TREE_DIFF = "generate_manifest_for_file_tree_diff" # Versions sync related features SKIP_SYNC_TAGS = "skip_sync_tags" @@ -1947,6 +1955,10 @@ def add_features(sender, **kwargs): ALLOW_VERSION_WARNING_BANNER, _("Dashboard: Allow project to use the version warning banner."), ), + ( + GENERATE_MANIFEST_FOR_FILE_TREE_DIFF, + _("Build: Generate a file manifest for file tree diff."), + ), # Versions sync related features ( SKIP_SYNC_BRANCHES, diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 1a76195e8a3..bc4cdfe6dc6 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -1,10 +1,12 @@ +import hashlib from fnmatch import fnmatch import structlog -from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL +from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST from readthedocs.builds.models import Build, Version -from readthedocs.projects.models import HTMLFile, Project +from readthedocs.filetreediff import write_manifest +from readthedocs.projects.models import Feature, HTMLFile, Project from readthedocs.projects.signals import files_changed from readthedocs.search.documents import PageDocument from readthedocs.search.utils import index_objects, remove_indexed_files @@ -120,7 +122,38 @@ def collect(self, sync_id: int): self.version.imported_files.exclude(build=sync_id).delete() -def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None): +class FileManifestIndexer(Indexer): + def __init__(self, version: Version, build: Build): + self.version = version + self.build = build + self._hashes = {} + + def process(self, html_file: HTMLFile, sync_id: int): + self._hashes[html_file.path] = hashlib.md5( + html_file.main_content.encode() + ).hexdigest() + + def collect(self, sync_id: int): + manifest = { + "build": 
{ + "id": self.build.id, + }, + "files": { + path: { + "hash": hash, + } + for path, hash in self._hashes.items() + }, + } + write_manifest(self.version, manifest) + + +def _get_indexers(*, version: Version, build: Build, search_index_name=None): + build_config = build.config or {} + search_config = build_config.get("search", {}) + search_ranking = search_config.get("ranking", {}) + search_ignore = search_config.get("ignore", []) + indexers = [] # NOTE: The search indexer must be before the index file indexer. # This is because saving the objects in the DB will give them an id, @@ -136,6 +169,22 @@ def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=N search_index_name=search_index_name, ) indexers.append(search_indexer) + + # File tree diff is under a feature flag for now, + # and we only allow to compare PR previous against the latest version. + has_feature = version.project.has_feature( + Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF + ) + create_manifest = has_feature and ( + version.is_external or version.slug == LATEST + ) + if create_manifest: + file_manifest_indexer = FileManifestIndexer( + version=version, + build=build, + ) + indexers.append(file_manifest_indexer) + index_file_indexer = IndexFileIndexer( project=version.project, version=version, @@ -230,16 +279,10 @@ def index_build(build_id): build_id=build.id, ) - build_config = build.config or {} - search_config = build_config.get("search", {}) - search_ranking = search_config.get("ranking", {}) - search_ignore = search_config.get("ignore", []) - try: indexers = _get_indexers( version=version, - search_ranking=search_ranking, - search_ignore=search_ignore, + build=build, ) _process_files(version=version, indexers=indexers) except Exception: @@ -280,17 +323,10 @@ def reindex_version(version_id, search_index_name=None): version_slug=version.slug, build_id=latest_successful_build.id, ) - - build_config = latest_successful_build.config or {} - search_config = 
build_config.get("search", {}) - search_ranking = search_config.get("ranking", {}) - search_ignore = search_config.get("ignore", []) - try: indexers = _get_indexers( version=version, - search_ranking=search_ranking, - search_ignore=search_ignore, + build=latest_successful_build, search_index_name=search_index_name, ) _process_files(version=version, indexers=indexers) diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index 51174fbab55..04971480c55 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -23,6 +23,7 @@ from readthedocs.core.resolver import Resolver from readthedocs.core.unresolver import UnresolverError, unresolver from readthedocs.core.utils.extend import SettingsOverrideObject +from readthedocs.filetreediff import get_diff from readthedocs.projects.constants import ( ADDONS_FLYOUT_SORTING_CALVER, ADDONS_FLYOUT_SORTING_CUSTOM_PATTERN, @@ -501,9 +502,28 @@ def _v1(self, project, version, build, filename, url, request): "trigger": "Slash", # Could be something like "Ctrl + D" }, }, + "filetreediff": { + "enabled": False, + }, }, } + if version.is_external: + latest_version = project.get_latest_version() + diff = get_diff(version_a=version, version_b=latest_version) + if diff: + diff_result = { + "added": [{"file": file} for file in diff.added], + "removed": [{"file": file} for file in diff.removed], + "modified": [{"file": file} for file in diff.modified], + } + data["addons"]["filetreediff"].update( + { + "enabled": True, + "diff": diff_result, + } + ) + # DocDiff depends on `url=` GET attribute. # This attribute allows us to know the exact filename where the request was made. # If we don't know the filename, we cannot return the data required by DocDiff to work. 
diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index f5a99294b79..30e92803ae4 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -1,5 +1,5 @@ """JSON/HTML parsers for search indexing.""" - +import functools import itertools import re @@ -20,6 +20,7 @@ def __init__(self, version): self.project = self.version.project self.storage = build_media_storage + @functools.cache def _get_page_content(self, page): """Gets the page content from storage.""" content = None @@ -34,7 +35,7 @@ def _get_page_content(self, page): content = f.read() except Exception: log.warning( - "Unhandled exception during search processing file.", + "Failed to get page content.", page=page, ) return content @@ -427,3 +428,13 @@ def _process_content(self, page, content): "title": title, "sections": sections, } + + def get_main_content(self, page): + try: + content = self._get_page_content(page) + html = HTMLParser(content) + body = self._get_main_node(html) + return body.html + except Exception: + log.info("Failed to get main content from page.", path=page, exc_info=True) + return ""