File tree diff

Closes #11319

Ref #11507
stsewd committed Oct 3, 2024
1 parent a182899 commit 2555ae6
Showing 6 changed files with 187 additions and 22 deletions.
84 changes: 84 additions & 0 deletions readthedocs/filetreediff/__init__.py
@@ -0,0 +1,84 @@
import json
from dataclasses import dataclass

from readthedocs.builds.constants import BUILD_STATE_FINISHED
from readthedocs.builds.models import Version
from readthedocs.projects.constants import MEDIA_TYPE_METADATA
from readthedocs.storage import build_media_storage


@dataclass
class FileTreeDiff:
    added: list[str]
    removed: list[str]
    modified: list[str]


def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None:
    version_a_manifest = get_manifest(version_a)
    version_b_manifest = get_manifest(version_b)

    if not version_a_manifest or not version_b_manifest:
        return None

    files_a = set(version_a_manifest.get("files", {}).keys())
    files_b = set(version_b_manifest.get("files", {}).keys())

    files_added = list(files_a - files_b)
    files_removed = list(files_b - files_a)
    files_modified = []
    for file_path in files_a & files_b:
        file_a = version_a_manifest["files"][file_path]
        file_b = version_b_manifest["files"][file_path]

        if file_a["hash"] != file_b["hash"]:
            files_modified.append(file_path)

    return FileTreeDiff(
        added=files_added,
        removed=files_removed,
        modified=files_modified,
    )


def get_manifest(version: Version) -> dict | None:
    storage_path = version.project.get_storage_path(
        type_=MEDIA_TYPE_METADATA,
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    manifest_path = build_media_storage.join(storage_path, "manifest.json")
    try:
        with build_media_storage.open(manifest_path) as manifest_file:
            manifest = json.load(manifest_file)
    except FileNotFoundError:
        return None

    latest_successful_build = version.builds.filter(
        state=BUILD_STATE_FINISHED,
        success=True,
    ).first()
    if not latest_successful_build:
        return None

    build_id_from_manifest = manifest.get("build", {}).get("id")
    if latest_successful_build.id != build_id_from_manifest:
        # The manifest is outdated. Do we still want to use it? Do we care?
        # Should the caller be responsible for handling this?
        return None

    return manifest


def write_manifest(version: Version, manifest: dict):
    storage_path = version.project.get_storage_path(
        type_=MEDIA_TYPE_METADATA,
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    manifest_path = build_media_storage.join(storage_path, "manifest.json")
    with build_media_storage.open(manifest_path, "w") as f:
        json.dump(manifest, f)
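
For reference, a minimal sketch of how these pieces fit together. The manifest layout mirrors what `FileManifestIndexer.collect()` writes (see `readthedocs/projects/tasks/search.py` below); the slug, build id, and hashes are made up for illustration:

```python
from readthedocs.builds.models import Version
from readthedocs.filetreediff import get_diff, write_manifest

# Manifest layout as written by FileManifestIndexer.collect(); values are hypothetical.
manifest = {
    "build": {"id": 1234},
    "files": {
        "index.html": {"hash": "d41d8cd98f00b204e9800998ecf8427e"},
        "guides/setup.html": {"hash": "0cc175b9c0f1b6a831c399e269772661"},
    },
}

pr_version = Version.objects.get(slug="1234")  # hypothetical external (PR) version
latest = pr_version.project.get_latest_version()

write_manifest(pr_version, manifest)

# get_diff() returns None if either manifest is missing or was not produced
# by the latest successful build; otherwise it returns a FileTreeDiff.
diff = get_diff(version_a=pr_version, version_b=latest)
if diff:
    print(diff.added, diff.removed, diff.modified)
```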
2 changes: 2 additions & 0 deletions readthedocs/projects/constants.py
@@ -34,6 +34,7 @@
MEDIA_TYPE_EPUB = "epub"
MEDIA_TYPE_HTMLZIP = "htmlzip"
MEDIA_TYPE_JSON = "json"
MEDIA_TYPE_METADATA = "metadata"
DOWNLOADABLE_MEDIA_TYPES = (
    MEDIA_TYPE_PDF,
    MEDIA_TYPE_EPUB,
@@ -45,6 +46,7 @@
    MEDIA_TYPE_EPUB,
    MEDIA_TYPE_HTMLZIP,
    MEDIA_TYPE_JSON,
    MEDIA_TYPE_METADATA,
)

BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/"
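
The new media type plugs into the same storage-path scheme as the other build artifacts. A hedged sketch of where the manifest ends up (`version` is a `Version` instance; the exact layout is decided by `get_storage_path()`):

```python
from readthedocs.projects.constants import MEDIA_TYPE_METADATA

# Illustrative only: the real path layout is up to get_storage_path().
storage_path = version.project.get_storage_path(
    type_=MEDIA_TYPE_METADATA,  # "metadata"
    version_slug=version.slug,
    include_file=False,
    version_type=version.type,
)
# write_manifest()/get_manifest() then join this path with "manifest.json".
```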
16 changes: 14 additions & 2 deletions readthedocs/projects/models.py
@@ -1524,13 +1524,20 @@ class Meta:
    objects = HTMLFileManager()

    def get_processed_json(self):
-        parser = GenericParser(self.version)
-        return parser.parse(self.path)
+        return self._parser.parse(self.path)

+    @cached_property
+    def _parser(self):
+        return GenericParser(self.version)

    @cached_property
    def processed_json(self):
        return self.get_processed_json()

+    @property
+    def main_content(self):
+        return self._parser.get_main_content(self.path)


class Notification(TimeStampedModel):

@@ -1887,6 +1894,7 @@ def add_features(sender, **kwargs):
    RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header"
    USE_PROXIED_APIS_WITH_PREFIX = "use_proxied_apis_with_prefix"
    ALLOW_VERSION_WARNING_BANNER = "allow_version_warning_banner"
    GENERATE_MANIFEST_FOR_FILE_TREE_DIFF = "generate_manifest_for_file_tree_diff"

    # Versions sync related features
    SKIP_SYNC_TAGS = "skip_sync_tags"
@@ -1947,6 +1955,10 @@ def add_features(sender, **kwargs):
            ALLOW_VERSION_WARNING_BANNER,
            _("Dashboard: Allow project to use the version warning banner."),
        ),
        (
            GENERATE_MANIFEST_FOR_FILE_TREE_DIFF,
            _("Build: Generate a file manifest for file tree diff."),
        ),
        # Versions sync related features
        (
            SKIP_SYNC_BRANCHES,
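
Since the manifest indexer only runs behind this flag, here is a sketch of enabling it for a project from the Django shell. This assumes the standard `Feature`/`Project` many-to-many relation that `has_feature()` checks; the project slug is hypothetical:

```python
from readthedocs.projects.models import Feature, Project

project = Project.objects.get(slug="docs")  # hypothetical slug
feature, _ = Feature.objects.get_or_create(
    feature_id=Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF,
)
feature.projects.add(project)

assert project.has_feature(Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF)
```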
72 changes: 54 additions & 18 deletions readthedocs/projects/tasks/search.py
@@ -1,10 +1,12 @@
+import hashlib
from fnmatch import fnmatch

import structlog

-from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL
+from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST
from readthedocs.builds.models import Build, Version
-from readthedocs.projects.models import HTMLFile, Project
+from readthedocs.filetreediff import write_manifest
+from readthedocs.projects.models import Feature, HTMLFile, Project
from readthedocs.projects.signals import files_changed
from readthedocs.search.documents import PageDocument
from readthedocs.search.utils import index_objects, remove_indexed_files
@@ -120,7 +122,38 @@ def collect(self, sync_id: int):
        self.version.imported_files.exclude(build=sync_id).delete()


-def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None):
+class FileManifestIndexer(Indexer):
+    def __init__(self, version: Version, build: Build):
+        self.version = version
+        self.build = build
+        self._hashes = {}
+
+    def process(self, html_file: HTMLFile, sync_id: int):
+        self._hashes[html_file.path] = hashlib.md5(
+            html_file.main_content.encode()
+        ).hexdigest()
+
+    def collect(self, sync_id: int):
+        manifest = {
+            "build": {
+                "id": self.build.id,
+            },
+            "files": {
+                path: {
+                    "hash": hash,
+                }
+                for path, hash in self._hashes.items()
+            },
+        }
+        write_manifest(self.version, manifest)
+
+
+def _get_indexers(*, version: Version, build: Build, search_index_name=None):
+    build_config = build.config or {}
+    search_config = build_config.get("search", {})
+    search_ranking = search_config.get("ranking", {})
+    search_ignore = search_config.get("ignore", [])
+
    indexers = []
    # NOTE: The search indexer must be before the index file indexer.
    # This is because saving the objects in the DB will give them an id,
@@ -136,6 +169,22 @@ def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None):
        search_index_name=search_index_name,
    )
    indexers.append(search_indexer)

    # File tree diff is under a feature flag for now,
    # and we only allow comparing PR previews against the latest version.
    has_feature = version.project.has_feature(
        Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF
    )
    create_manifest = has_feature and (
        version.is_external or version.slug == LATEST
    )
    if create_manifest:
        file_manifest_indexer = FileManifestIndexer(
            version=version,
            build=build,
        )
        indexers.append(file_manifest_indexer)

    index_file_indexer = IndexFileIndexer(
        project=version.project,
        version=version,
@@ -230,16 +279,10 @@ def index_build(build_id):
        build_id=build.id,
    )

-    build_config = build.config or {}
-    search_config = build_config.get("search", {})
-    search_ranking = search_config.get("ranking", {})
-    search_ignore = search_config.get("ignore", [])
-
    try:
        indexers = _get_indexers(
            version=version,
-            search_ranking=search_ranking,
-            search_ignore=search_ignore,
+            build=build,
        )
        _process_files(version=version, indexers=indexers)
    except Exception:
@@ -280,17 +323,10 @@ def reindex_version(version_id, search_index_name=None):
        version_slug=version.slug,
        build_id=latest_successful_build.id,
    )

-    build_config = latest_successful_build.config or {}
-    search_config = build_config.get("search", {})
-    search_ranking = search_config.get("ranking", {})
-    search_ignore = search_config.get("ignore", [])
-
    try:
        indexers = _get_indexers(
            version=version,
-            search_ranking=search_ranking,
-            search_ignore=search_ignore,
+            build=latest_successful_build,
            search_index_name=search_index_name,
        )
        _process_files(version=version, indexers=indexers)
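
For context, the indexers returned by `_get_indexers()` are driven by `_process_files()`, which is not shown in this diff. A rough sketch of the assumed protocol, where `html_files` is the iterable of `HTMLFile` objects being synced and `sync_id` identifies the sync:

```python
indexers = _get_indexers(version=version, build=build)

# Each indexer sees every HTML file once...
for html_file in html_files:
    for indexer in indexers:
        indexer.process(html_file, sync_id)

# ...then flushes its results: search documents to Elasticsearch, ImportedFile
# rows to the database, and now the file-tree manifest to storage.
for indexer in indexers:
    indexer.collect(sync_id)
```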
20 changes: 20 additions & 0 deletions readthedocs/proxito/views/hosting.py
@@ -23,6 +23,7 @@
from readthedocs.core.resolver import Resolver
from readthedocs.core.unresolver import UnresolverError, unresolver
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.filetreediff import get_diff
from readthedocs.projects.constants import (
    ADDONS_FLYOUT_SORTING_CALVER,
    ADDONS_FLYOUT_SORTING_CUSTOM_PATTERN,
@@ -501,9 +502,28 @@ def _v1(self, project, version, build, filename, url, request):
"trigger": "Slash", # Could be something like "Ctrl + D"
},
},
"filetreediff": {
"enabled": False,
},
},
}

if version.is_external:
latest_version = project.get_latest_version()
diff = get_diff(version_a=version, version_b=latest_version)
if diff:
diff_result = {
"added": [{"file": file} for file in diff.added],
"removed": [{"file": file} for file in diff.removed],
"modified": [{"file": file} for file in diff.modified],
}
data["addons"]["filetreediff"].update(
{
"enabled": True,
"diff": diff_result,
}
)

# DocDiff depends on `url=` GET attribute.
# This attribute allows us to know the exact filename where the request was made.
# If we don't know the filename, we cannot return the data required by DocDiff to work.
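
When an external (PR) version has a usable manifest, the resulting addons block looks roughly like this; the file names are made up:

```python
# Shape of data["addons"]["filetreediff"] returned by the API above.
filetreediff = {
    "enabled": True,
    "diff": {
        "added": [{"file": "guides/new-page.html"}],
        "removed": [],
        "modified": [{"file": "index.html"}],
    },
}
```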
15 changes: 13 additions & 2 deletions readthedocs/search/parsers.py
@@ -1,5 +1,5 @@
"""JSON/HTML parsers for search indexing."""

import functools
import itertools
import re

@@ -20,6 +20,7 @@ def __init__(self, version):
        self.project = self.version.project
        self.storage = build_media_storage

    @functools.cache
    def _get_page_content(self, page):
        """Gets the page content from storage."""
        content = None
@@ -34,7 +35,7 @@ def _get_page_content(self, page):
                content = f.read()
        except Exception:
            log.warning(
-                "Unhandled exception during search processing file.",
+                "Failed to get page content.",
                page=page,
            )
        return content
@@ -427,3 +428,13 @@ def _process_content(self, page, content):
"title": title,
"sections": sections,
}

def get_main_content(self, page):
try:
content = self._get_page_content(page)
html = HTMLParser(content)
body = self._get_main_node(html)
return body.html
except Exception:
log.info("Failed to get main content from page.", path=page, exc_info=True)
return ""
