File tree diff

Closes #11319

Ref #11507
stsewd committed Oct 3, 2024
1 parent a182899 commit 2555ae6
Showing 6 changed files with 187 additions and 22 deletions.
84 changes: 84 additions & 0 deletions readthedocs/filetreediff/__init__.py
@@ -0,0 +1,84 @@
import json
from dataclasses import dataclass

from readthedocs.builds.constants import BUILD_STATE_FINISHED
from readthedocs.builds.models import Version
from readthedocs.projects.constants import MEDIA_TYPE_METADATA
from readthedocs.storage import build_media_storage


@dataclass
class FileTreeDiff:
    added: list[str]
    removed: list[str]
    modified: list[str]


def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None:
    version_a_manifest = get_manifest(version_a)
    version_b_manifest = get_manifest(version_b)

    if not version_a_manifest or not version_b_manifest:
        return None

    files_a = set(version_a_manifest.get("files", {}).keys())
    files_b = set(version_b_manifest.get("files", {}).keys())

    files_added = list(files_a - files_b)
    files_removed = list(files_b - files_a)
    files_modified = []
    for file_path in files_a & files_b:
        file_a = version_a_manifest["files"][file_path]
        file_b = version_b_manifest["files"][file_path]

        if file_a["hash"] != file_b["hash"]:
            files_modified.append(file_path)

    return FileTreeDiff(
        added=files_added,
        removed=files_removed,
        modified=files_modified,
    )


def get_manifest(version: Version) -> dict | None:
    storage_path = version.project.get_storage_path(
        type_=MEDIA_TYPE_METADATA,
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    manifest_path = build_media_storage.join(storage_path, "manifest.json")
    try:
        with build_media_storage.open(manifest_path) as manifest_file:
            manifest = json.load(manifest_file)
    except FileNotFoundError:
        return None

    latest_successful_build = version.builds.filter(
        state=BUILD_STATE_FINISHED,
        success=True,
    ).first()
    if not latest_successful_build:
        return None

    build_id_from_manifest = manifest.get("build", {}).get("id")
    if latest_successful_build.id != build_id_from_manifest:
        # The manifest is outdated. Do we still want to use it? Do we care?
        # Should the caller be responsible for handling this?
        return None

    return manifest


def write_manifest(version: Version, manifest: dict):
    storage_path = version.project.get_storage_path(
        type_=MEDIA_TYPE_METADATA,
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    manifest_path = build_media_storage.join(storage_path, "manifest.json")
    with build_media_storage.open(manifest_path, "w") as f:
        json.dump(manifest, f)
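
For reference, a minimal sketch of how these pieces fit together. The manifest layout mirrors what `FileManifestIndexer.collect()` writes (see `readthedocs/projects/tasks/search.py` below); the slug, build id, and hashes are made up for illustration:

```python
from readthedocs.builds.models import Version
from readthedocs.filetreediff import get_diff, write_manifest

# Manifest layout as written by FileManifestIndexer.collect(); values are hypothetical.
manifest = {
    "build": {"id": 1234},
    "files": {
        "index.html": {"hash": "d41d8cd98f00b204e9800998ecf8427e"},
        "guides/setup.html": {"hash": "0cc175b9c0f1b6a831c399e269772661"},
    },
}

pr_version = Version.objects.get(slug="1234")  # hypothetical external (PR) version
latest = pr_version.project.get_latest_version()

write_manifest(pr_version, manifest)

# get_diff() returns None if either manifest is missing or was not produced
# by the latest successful build; otherwise it returns a FileTreeDiff.
diff = get_diff(version_a=pr_version, version_b=latest)
if diff:
    print(diff.added, diff.removed, diff.modified)
```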
2 changes: 2 additions & 0 deletions readthedocs/projects/constants.py
@@ -34,6 +34,7 @@
MEDIA_TYPE_EPUB = "epub"
MEDIA_TYPE_HTMLZIP = "htmlzip"
MEDIA_TYPE_JSON = "json"
MEDIA_TYPE_METADATA = "metadata"
DOWNLOADABLE_MEDIA_TYPES = (
    MEDIA_TYPE_PDF,
    MEDIA_TYPE_EPUB,
@@ -45,6 +46,7 @@
    MEDIA_TYPE_EPUB,
    MEDIA_TYPE_HTMLZIP,
    MEDIA_TYPE_JSON,
    MEDIA_TYPE_METADATA,
)

BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/"
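
The new media type plugs into the same storage-path scheme as the other build artifacts. A hedged sketch of where the manifest ends up (`version` is a `Version` instance; the exact layout is decided by `get_storage_path()`):

```python
from readthedocs.projects.constants import MEDIA_TYPE_METADATA

# Illustrative only: the real path layout is up to get_storage_path().
storage_path = version.project.get_storage_path(
    type_=MEDIA_TYPE_METADATA,  # "metadata"
    version_slug=version.slug,
    include_file=False,
    version_type=version.type,
)
# write_manifest()/get_manifest() then join this path with "manifest.json".
```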
16 changes: 14 additions & 2 deletions readthedocs/projects/models.py
@@ -1524,13 +1524,20 @@ class Meta:
    objects = HTMLFileManager()

    def get_processed_json(self):
-        parser = GenericParser(self.version)
-        return parser.parse(self.path)
+        return self._parser.parse(self.path)

+    @cached_property
+    def _parser(self):
+        return GenericParser(self.version)

    @cached_property
    def processed_json(self):
        return self.get_processed_json()

+    @property
+    def main_content(self):
+        return self._parser.get_main_content(self.path)


class Notification(TimeStampedModel):

@@ -1887,6 +1894,7 @@ def add_features(sender, **kwargs):
    RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header"
    USE_PROXIED_APIS_WITH_PREFIX = "use_proxied_apis_with_prefix"
    ALLOW_VERSION_WARNING_BANNER = "allow_version_warning_banner"
    GENERATE_MANIFEST_FOR_FILE_TREE_DIFF = "generate_manifest_for_file_tree_diff"

    # Versions sync related features
    SKIP_SYNC_TAGS = "skip_sync_tags"
@@ -1947,6 +1955,10 @@ def add_features(sender, **kwargs):
            ALLOW_VERSION_WARNING_BANNER,
            _("Dashboard: Allow project to use the version warning banner."),
        ),
        (
            GENERATE_MANIFEST_FOR_FILE_TREE_DIFF,
            _("Build: Generate a file manifest for file tree diff."),
        ),
        # Versions sync related features
        (
            SKIP_SYNC_BRANCHES,
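
Since the manifest indexer only runs behind this flag, here is a sketch of enabling it for a project from the Django shell. This assumes the standard `Feature`/`Project` many-to-many relation that `has_feature()` checks; the project slug is hypothetical:

```python
from readthedocs.projects.models import Feature, Project

project = Project.objects.get(slug="docs")  # hypothetical slug
feature, _ = Feature.objects.get_or_create(
    feature_id=Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF,
)
feature.projects.add(project)

assert project.has_feature(Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF)
```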
72 changes: 54 additions & 18 deletions readthedocs/projects/tasks/search.py
@@ -1,10 +1,12 @@
+import hashlib
from fnmatch import fnmatch

import structlog

-from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL
+from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST
from readthedocs.builds.models import Build, Version
-from readthedocs.projects.models import HTMLFile, Project
+from readthedocs.filetreediff import write_manifest
+from readthedocs.projects.models import Feature, HTMLFile, Project
from readthedocs.projects.signals import files_changed
from readthedocs.search.documents import PageDocument
from readthedocs.search.utils import index_objects, remove_indexed_files
@@ -120,7 +122,38 @@ def collect(self, sync_id: int):
        self.version.imported_files.exclude(build=sync_id).delete()


-def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None):
+class FileManifestIndexer(Indexer):
+    def __init__(self, version: Version, build: Build):
+        self.version = version
+        self.build = build
+        self._hashes = {}
+
+    def process(self, html_file: HTMLFile, sync_id: int):
+        self._hashes[html_file.path] = hashlib.md5(
+            html_file.main_content.encode()
+        ).hexdigest()
+
+    def collect(self, sync_id: int):
+        manifest = {
+            "build": {
+                "id": self.build.id,
+            },
+            "files": {
+                path: {
+                    "hash": hash,
+                }
+                for path, hash in self._hashes.items()
+            },
+        }
+        write_manifest(self.version, manifest)
+
+
+def _get_indexers(*, version: Version, build: Build, search_index_name=None):
+    build_config = build.config or {}
+    search_config = build_config.get("search", {})
+    search_ranking = search_config.get("ranking", {})
+    search_ignore = search_config.get("ignore", [])
+
    indexers = []
    # NOTE: The search indexer must be before the index file indexer.
    # This is because saving the objects in the DB will give them an id,
@@ -136,6 +169,22 @@ def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None):
        search_index_name=search_index_name,
    )
    indexers.append(search_indexer)

    # File tree diff is under a feature flag for now,
    # and we only allow comparing PR previews against the latest version.
    has_feature = version.project.has_feature(
        Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF
    )
    create_manifest = has_feature and (
        version.is_external or version.slug == LATEST
    )
    if create_manifest:
        file_manifest_indexer = FileManifestIndexer(
            version=version,
            build=build,
        )
        indexers.append(file_manifest_indexer)

    index_file_indexer = IndexFileIndexer(
        project=version.project,
        version=version,
@@ -230,16 +279,10 @@ def index_build(build_id):
        build_id=build.id,
    )

-    build_config = build.config or {}
-    search_config = build_config.get("search", {})
-    search_ranking = search_config.get("ranking", {})
-    search_ignore = search_config.get("ignore", [])
-
    try:
        indexers = _get_indexers(
            version=version,
-            search_ranking=search_ranking,
-            search_ignore=search_ignore,
+            build=build,
        )
        _process_files(version=version, indexers=indexers)
    except Exception:
@@ -280,17 +323,10 @@ def reindex_version(version_id, search_index_name=None):
        version_slug=version.slug,
        build_id=latest_successful_build.id,
    )

-    build_config = latest_successful_build.config or {}
-    search_config = build_config.get("search", {})
-    search_ranking = search_config.get("ranking", {})
-    search_ignore = search_config.get("ignore", [])
-
    try:
        indexers = _get_indexers(
            version=version,
-            search_ranking=search_ranking,
-            search_ignore=search_ignore,
+            build=latest_successful_build,
            search_index_name=search_index_name,
        )
        _process_files(version=version, indexers=indexers)
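
For context, the indexers returned by `_get_indexers()` are driven by `_process_files()`, which is not shown in this diff. A rough sketch of the assumed protocol, where `html_files` is the iterable of `HTMLFile` objects being synced and `sync_id` identifies the sync:

```python
indexers = _get_indexers(version=version, build=build)

# Each indexer sees every HTML file once...
for html_file in html_files:
    for indexer in indexers:
        indexer.process(html_file, sync_id)

# ...then flushes its results: search documents to Elasticsearch, ImportedFile
# rows to the database, and now the file-tree manifest to storage.
for indexer in indexers:
    indexer.collect(sync_id)
```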
20 changes: 20 additions & 0 deletions readthedocs/proxito/views/hosting.py
@@ -23,6 +23,7 @@
from readthedocs.core.resolver import Resolver
from readthedocs.core.unresolver import UnresolverError, unresolver
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.filetreediff import get_diff
from readthedocs.projects.constants import (
    ADDONS_FLYOUT_SORTING_CALVER,
    ADDONS_FLYOUT_SORTING_CUSTOM_PATTERN,
@@ -501,9 +502,28 @@ def _v1(self, project, version, build, filename, url, request):
"trigger": "Slash", # Could be something like "Ctrl + D"
},
},
"filetreediff": {
"enabled": False,
},
},
}

if version.is_external:
latest_version = project.get_latest_version()
diff = get_diff(version_a=version, version_b=latest_version)
if diff:
diff_result = {
"added": [{"file": file} for file in diff.added],
"removed": [{"file": file} for file in diff.removed],
"modified": [{"file": file} for file in diff.modified],
}
data["addons"]["filetreediff"].update(
{
"enabled": True,
"diff": diff_result,
}
)

# DocDiff depends on `url=` GET attribute.
# This attribute allows us to know the exact filename where the request was made.
# If we don't know the filename, we cannot return the data required by DocDiff to work.
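
When an external (PR) version has a usable manifest, the resulting addons block looks roughly like this; the file names are made up:

```python
# Shape of data["addons"]["filetreediff"] returned by the API above.
filetreediff = {
    "enabled": True,
    "diff": {
        "added": [{"file": "guides/new-page.html"}],
        "removed": [],
        "modified": [{"file": "index.html"}],
    },
}
```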
15 changes: 13 additions & 2 deletions readthedocs/search/parsers.py
@@ -1,5 +1,5 @@
"""JSON/HTML parsers for search indexing."""

import functools
import itertools
import re

@@ -20,6 +20,7 @@ def __init__(self, version):
        self.project = self.version.project
        self.storage = build_media_storage

    @functools.cache
    def _get_page_content(self, page):
        """Gets the page content from storage."""
        content = None
@@ -34,7 +35,7 @@ def _get_page_content(self, page):
                content = f.read()
        except Exception:
            log.warning(
-                "Unhandled exception during search processing file.",
+                "Failed to get page content.",
                page=page,
            )
        return content
@@ -427,3 +428,13 @@ def _process_content(self, page, content):
"title": title,
"sections": sections,
}

def get_main_content(self, page):
try:
content = self._get_page_content(page)
html = HTMLParser(content)
body = self._get_main_node(html)
return body.html
except Exception:
log.info("Failed to get main content from page.", path=page, exc_info=True)
return ""
