From 759733c140eccfef5e7a948bbd3babcae9688a30 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 7 Apr 2022 10:47:50 -0500 Subject: [PATCH] Traffic analytics: track 404s (#9027) Co-authored-by: Eric Holscher <25510+ericholscher@users.noreply.github.com> Co-authored-by: Manuel Kaufmann --- .gitignore | 1 + .../migrations/0002_track_status_code.py | 55 +++++++ readthedocs/analytics/models.py | 134 +++++++++++++----- readthedocs/analytics/proxied_api.py | 17 +-- readthedocs/analytics/tests.py | 12 +- readthedocs/projects/views/private.py | 25 ++-- readthedocs/proxito/tests/test_full.py | 89 ++++++++++++ readthedocs/proxito/views/serve.py | 41 +++++- .../projects/project_traffic_analytics.html | 38 ++++- 9 files changed, 340 insertions(+), 72 deletions(-) create mode 100644 readthedocs/analytics/migrations/0002_track_status_code.py diff --git a/.gitignore b/.gitignore index b04e6803f7a..54f175b7859 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ .cache .coverage .coverage.* +coverage.xml .idea .vagrant .vscode diff --git a/readthedocs/analytics/migrations/0002_track_status_code.py b/readthedocs/analytics/migrations/0002_track_status_code.py new file mode 100644 index 00000000000..4d2f6a08fa3 --- /dev/null +++ b/readthedocs/analytics/migrations/0002_track_status_code.py @@ -0,0 +1,55 @@ +# Generated by Django 3.2.12 on 2022-03-29 17:51 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("builds", "0041_track_task_id"), + ("projects", "0087_use_booleanfield_null"), + ("analytics", "0001_initial"), + ] + + operations = [ + migrations.AddField( + model_name="pageview", + name="full_path", + field=models.CharField( + blank=True, + help_text="Full path including the version and language parts.", + max_length=4096, + null=True, + ), + ), + migrations.AddField( + model_name="pageview", + name="status", + field=models.PositiveIntegerField( + default=200, help_text="HTTP status code" + ), + ), + migrations.AlterField( + model_name="pageview", + name="path", + field=models.CharField( + help_text="Path relative to the version.", max_length=4096 + ), + ), + migrations.AlterField( + model_name="pageview", + name="version", + field=models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="page_views", + to="builds.version", + verbose_name="Version", + ), + ), + migrations.AlterUniqueTogether( + name="pageview", + unique_together={("project", "version", "path", "date", "status")}, + ), + ] diff --git a/readthedocs/analytics/models.py b/readthedocs/analytics/models.py index 9adbcebd961..169610b5387 100644 --- a/readthedocs/analytics/models.py +++ b/readthedocs/analytics/models.py @@ -1,6 +1,7 @@ """Analytics modeling to help understand the projects on Read the Docs.""" - import datetime +from collections import namedtuple +from urllib.parse import urlparse from django.db import models from django.db.models import Sum @@ -8,6 +9,7 @@ from django.utils.translation import gettext_lazy as _ from readthedocs.builds.models import Version +from readthedocs.core.resolver import resolve, resolve_path from readthedocs.projects.models import Project @@ -19,6 +21,27 @@ def _last_30_days_iter(): return (thirty_days_ago + timezone.timedelta(days=n) for n in range(31)) +class PageViewManager(models.Manager): + def register_page_view(self, project, version, path, full_path, status): + # Normalize paths to avoid duplicates. + path = "/" + path.lstrip("/") + full_path = "/" + full_path.lstrip("/") + + page_view, created = self.get_or_create( + project=project, + version=version, + path=path, + full_path=full_path, + date=timezone.now().date(), + status=status, + defaults={"view_count": 1}, + ) + if not created: + page_view.view_count = models.F("view_count") + 1 + page_view.save(update_fields=["view_count"]) + return page_view + + class PageView(models.Model): """PageView counts per day for a project, version, and path.""" @@ -28,65 +51,94 @@ class PageView(models.Model): related_name='page_views', on_delete=models.CASCADE, ) + # NOTE: this could potentially be removed, + # since isn't being used and not all page + # views (404s) are attached to a version. version = models.ForeignKey( Version, verbose_name=_('Version'), related_name='page_views', on_delete=models.CASCADE, + null=True, + ) + path = models.CharField( + max_length=4096, + help_text=_("Path relative to the version."), + ) + full_path = models.CharField( + max_length=4096, + help_text=_("Full path including the version and language parts."), + null=True, + blank=True, ) - path = models.CharField(max_length=4096) view_count = models.PositiveIntegerField(default=0) date = models.DateField(default=datetime.date.today, db_index=True) + status = models.PositiveIntegerField( + default=200, + help_text=_("HTTP status code"), + ) + + objects = PageViewManager() class Meta: - unique_together = ("project", "version", "path", "date") + unique_together = ("project", "version", "path", "date", "status") def __str__(self): - return f'PageView: [{self.project.slug}:{self.version.slug}] - {self.path} for {self.date}' + return f"PageView: [{self.project.slug}] - {self.full_path or self.path} for {self.date}" @classmethod - def top_viewed_pages(cls, project, since=None, limit=10): + def top_viewed_pages( + cls, project, since=None, limit=10, status=200, per_version=False + ): """ Returns top pages according to view counts. - Structure of returned data is compatible to make graphs. - Sample returned data:: - { - 'pages': ['index', 'config-file/v1', 'intro/import-guide'], - 'view_counts': [150, 120, 100] - } - This data shows that `index` is the most viewed page having 150 total views, - followed by `config-file/v1` and `intro/import-guide` having 120 and - 100 total page views respectively. + :param per_version: If `True`, group the results per version. + + :returns: A list of named tuples ordered by the number of views. + Each tuple contains: path, url, and count. """ + # pylint: disable=too-many-locals if since is None: since = timezone.now().date() - timezone.timedelta(days=30) + group_by = "full_path" if per_version else "path" queryset = ( - cls.objects - .filter(project=project, date__gte=since) - .values_list('path') - .annotate(total_views=Sum('view_count')) - .values_list('path', 'total_views') - .order_by('-total_views')[:limit] + cls.objects.filter(project=project, date__gte=since, status=status) + .values_list(group_by) + .annotate(count=Sum("view_count")) + .values_list(group_by, "count", named=True) + .order_by("-count")[:limit] ) - pages = [] - view_counts = [] - - for data in queryset.iterator(): - pages.append(data[0]) - view_counts.append(data[1]) - - final_data = { - 'pages': pages, - 'view_counts': view_counts, - } - - return final_data + PageViewResult = namedtuple("PageViewResult", "path, url, count") + result = [] + parsed_domain = urlparse(resolve(project)) + default_version = project.get_default_version() + for row in queryset: + if not per_version: + # If we aren't groupig by version, + # then always link to the default version. + url_path = resolve_path( + project=project, + version_slug=default_version, + filename=row.path, + ) + else: + url_path = row.full_path or "" + url = parsed_domain._replace(path=url_path).geturl() + path = row.full_path if per_version else row.path + result.append( + PageViewResult( + path=path, + url=url, + count=row.count, + ) + ) + return result @classmethod - def page_views_by_date(cls, project_slug, since=None): + def page_views_by_date(cls, project_slug, since=None, status=200): """ Returns the total page views count for last 30 days for a particular project. @@ -102,10 +154,16 @@ def page_views_by_date(cls, project_slug, since=None): if since is None: since = timezone.now().date() - timezone.timedelta(days=30) - queryset = cls.objects.filter( - project__slug=project_slug, - date__gte=since, - ).values('date').annotate(total_views=Sum('view_count')).order_by('date') + queryset = ( + cls.objects.filter( + project__slug=project_slug, + date__gte=since, + status=status, + ) + .values("date") + .annotate(total_views=Sum("view_count")) + .order_by("date") + ) count_dict = dict( queryset.order_by('date').values_list('date', 'total_views') diff --git a/readthedocs/analytics/proxied_api.py b/readthedocs/analytics/proxied_api.py index a7d40892923..c9e7cfdeab5 100644 --- a/readthedocs/analytics/proxied_api.py +++ b/readthedocs/analytics/proxied_api.py @@ -1,10 +1,8 @@ """Analytics views that are served from the same domain as the docs.""" - from functools import lru_cache +from urllib.parse import urlparse -from django.db.models import F from django.shortcuts import get_object_or_404 -from django.utils import timezone from rest_framework.response import Response from rest_framework.views import APIView @@ -67,20 +65,15 @@ def increase_page_view_count(self, request, project, version, absolute_uri): return path = unresolved.filename + full_path = urlparse(absolute_uri).path - fields = dict( + PageView.objects.register_page_view( project=project, version=version, path=path, - date=timezone.now().date(), - ) - page_view, created = PageView.objects.get_or_create( - **fields, - defaults={'view_count': 1}, + full_path=full_path, + status=200, ) - if not created: - page_view.view_count = F('view_count') + 1 - page_view.save(update_fields=['view_count']) class AnalyticsView(SettingsOverrideObject): diff --git a/readthedocs/analytics/tests.py b/readthedocs/analytics/tests.py index 6e683c78b54..46bd8799882 100644 --- a/readthedocs/analytics/tests.py +++ b/readthedocs/analytics/tests.py @@ -140,8 +140,8 @@ def test_increase_page_view_count(self): assert ( PageView.objects.all().count() == 1 - ), f'PageView object for path \'{self.absolute_uri}\' is already created' - assert PageView.objects.filter(path='index.html').count() == 1 + ), f"PageView object for path '{self.absolute_uri}' is already created" + assert PageView.objects.filter(path="/index.html").count() == 1 assert ( PageView.objects.all().first().view_count == 2 ), f'\'{self.absolute_uri}\' has 2 views now' @@ -154,8 +154,8 @@ def test_increase_page_view_count(self): assert ( PageView.objects.all().count() == 2 - ), f'PageView object for path \'{self.absolute_uri}\' is created for two days (yesterday and today)' - assert PageView.objects.filter(path='index.html').count() == 2 + ), f"PageView object for path '{self.absolute_uri}' is created for two days (yesterday and today)" + assert PageView.objects.filter(path="/index.html").count() == 2 assert ( PageView.objects.all().order_by('-date').first().view_count == 1 ), f'\'{self.absolute_uri}\' has 1 view today' @@ -168,8 +168,8 @@ def test_increase_page_view_count(self): assert ( PageView.objects.all().count() == 3 - ), f'PageView object for path \'{self.absolute_uri}\' is created for three days (yesterday, today & tomorrow)' - assert PageView.objects.filter(path='index.html').count() == 3 + ), f"PageView object for path '{self.absolute_uri}' is created for three days (yesterday, today & tomorrow)" + assert PageView.objects.filter(path="/index.html").count() == 3 assert ( PageView.objects.all().order_by('-date').first().view_count == 1 ), f'\'{self.absolute_uri}\' has 1 view tomorrow' diff --git a/readthedocs/projects/views/private.py b/readthedocs/projects/views/private.py index 8f0c4d2105f..15a60541986 100644 --- a/readthedocs/projects/views/private.py +++ b/readthedocs/projects/views/private.py @@ -1,7 +1,6 @@ """Project views for authenticated users.""" import structlog - from allauth.socialaccount.models import SocialAccount from django.conf import settings from django.contrib import messages @@ -1180,21 +1179,26 @@ def get_context_data(self, **kwargs): return context # Count of views for top pages over the month - top_pages = PageView.top_viewed_pages(project, limit=25) - top_viewed_pages = list(zip( - top_pages['pages'], - top_pages['view_counts'] - )) + top_pages_200 = PageView.top_viewed_pages(project, limit=25) + top_pages_404 = PageView.top_viewed_pages( + project, + limit=25, + status=404, + per_version=True, + ) # Aggregate pageviews grouped by day page_data = PageView.page_views_by_date( project_slug=project.slug, ) - context.update({ - 'top_viewed_pages': top_viewed_pages, - 'page_data': page_data, - }) + context.update( + { + "top_pages_200": top_pages_200, + "page_data": page_data, + "top_pages_404": top_pages_404, + } + ) return context @@ -1220,6 +1224,7 @@ def _get_csv_data(self): PageView.objects.filter( project=project, date__gte=days_ago, + status=200, ) .order_by('-date') .values_list(*[value for _, value in values]) diff --git a/readthedocs/proxito/tests/test_full.py b/readthedocs/proxito/tests/test_full.py index fefa21cb581..50fcf5f37f2 100644 --- a/readthedocs/proxito/tests/test_full.py +++ b/readthedocs/proxito/tests/test_full.py @@ -10,6 +10,7 @@ from django.urls import reverse from django_dynamic_fixture import get +from readthedocs.analytics.models import PageView from readthedocs.audit.models import AuditLog from readthedocs.builds.constants import EXTERNAL, INTERNAL, LATEST from readthedocs.builds.models import Version @@ -875,6 +876,94 @@ def test_404_all_paths_checked_default_version_different_doc_type(self, storage_ ] ) + @mock.patch.object(BuildMediaFileSystemStorageTest, "exists") + def test_track_broken_link(self, storage_exists): + self.assertEqual(PageView.objects.all().count(), 0) + + paths = [ + "/en/latest/not-found/", + "/en/latest/not-found/", + "/not-found", + "/en/not-found/", + ] + for path in paths: + storage_exists.reset_mock() + storage_exists.return_value = False + resp = self.client.get( + reverse( + "proxito_404_handler", + kwargs={"proxito_path": path}, + ), + HTTP_HOST="project.readthedocs.io", + ) + self.assertEqual(resp.status_code, 404) + + self.assertEqual(PageView.objects.all().count(), 3) + + version = self.project.versions.get(slug="latest") + + pageview = PageView.objects.get(full_path="/en/latest/not-found/") + self.assertEqual(pageview.project, self.project) + self.assertEqual(pageview.version, version) + self.assertEqual(pageview.path, "/not-found/") + self.assertEqual(pageview.view_count, 2) + self.assertEqual(pageview.status, 404) + + pageview = PageView.objects.get(full_path="/not-found") + self.assertEqual(pageview.project, self.project) + self.assertEqual(pageview.version, None) + self.assertEqual(pageview.path, "/not-found") + self.assertEqual(pageview.view_count, 1) + self.assertEqual(pageview.status, 404) + + pageview = PageView.objects.get(full_path="/en/not-found/") + self.assertEqual(pageview.project, self.project) + self.assertEqual(pageview.version, None) + self.assertEqual(pageview.path, "/en/not-found/") + self.assertEqual(pageview.view_count, 1) + self.assertEqual(pageview.status, 404) + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + @mock.patch.object(BuildMediaFileSystemStorageTest, "exists") + def test_track_broken_link_custom_404(self, storage_exists, storage_open): + self.assertEqual(PageView.objects.all().count(), 0) + + paths = [ + "/en/latest/not-found", + "/en/latest/not-found", + "/en/latest/not-found/", + ] + for path in paths: + storage_open.reset_mock() + storage_exists.reset_mock() + storage_exists.side_effect = [False, False, True] + resp = self.client.get( + reverse( + "proxito_404_handler", + kwargs={"proxito_path": path}, + ), + HTTP_HOST="project.readthedocs.io", + ) + self.assertEqual(resp.status_code, 404) + storage_open.assert_called_once_with("html/project/latest/404.html") + + self.assertEqual(PageView.objects.all().count(), 2) + version = self.project.versions.get(slug="latest") + + pageview = PageView.objects.get(path="/not-found") + self.assertEqual(pageview.project, self.project) + self.assertEqual(pageview.version, version) + self.assertEqual(pageview.full_path, "/en/latest/not-found") + self.assertEqual(pageview.view_count, 2) + self.assertEqual(pageview.status, 404) + + pageview = PageView.objects.get(path="/not-found/") + self.assertEqual(pageview.project, self.project) + self.assertEqual(pageview.version, version) + self.assertEqual(pageview.full_path, "/en/latest/not-found/") + self.assertEqual(pageview.view_count, 1) + self.assertEqual(pageview.status, 404) + def test_sitemap_xml(self): self.project.versions.update(active=True) private_version = fixture.get( diff --git a/readthedocs/proxito/views/serve.py b/readthedocs/proxito/views/serve.py index 12049b26678..1220b0528ee 100644 --- a/readthedocs/proxito/views/serve.py +++ b/readthedocs/proxito/views/serve.py @@ -11,6 +11,7 @@ from django.views import View from django.views.decorators.cache import cache_page +from readthedocs.analytics.models import PageView from readthedocs.builds.constants import EXTERNAL, LATEST, STABLE from readthedocs.builds.models import Version from readthedocs.core.mixins import CachedView @@ -342,11 +343,12 @@ def get(self, request, proxito_path, template_name='404.html'): # If that doesn't work, attempt to serve the 404 of the current version (version_slug) # Secondly, try to serve the 404 page for the default version # (project.get_default_version()) - doc_type = ( + version = ( Version.objects.filter(project=final_project, slug=version_slug) - .values_list('documentation_type', flat=True) + .only("documentation_type") .first() ) + doc_type = version.documentation_type if version else None versions = [(version_slug, doc_type)] default_version_slug = final_project.get_default_version() if default_version_slug != version_slug: @@ -382,10 +384,45 @@ def get(self, request, proxito_path, template_name='404.html'): ) resp = HttpResponse(build_media_storage.open(storage_filename_path).read()) resp.status_code = 404 + self._register_broken_link( + project=final_project, + version=version, + path=redirect_filename, + full_path=proxito_path, + ) return resp + self._register_broken_link( + project=final_project, + version=version, + path=redirect_filename, + full_path=proxito_path, + ) raise Http404('No custom 404 page found.') + def _register_broken_link(self, project, version, path, full_path): + try: + # If the path isn't attached to a version + # it should be the same as the full_path, + # otherwise it would be empty. + if not version: + path = full_path + PageView.objects.register_page_view( + project=project, + version=version, + path=path, + full_path=full_path, + status=404, + ) + except Exception: + # Don't break doc serving if there was an error + # while recording the broken link. + log.exception( + "Error while recording the broken link", + project_slug=project.slug, + full_path=full_path, + ) + class ServeError404(SettingsOverrideObject): _default_class = ServeError404Base diff --git a/readthedocs/templates/projects/project_traffic_analytics.html b/readthedocs/templates/projects/project_traffic_analytics.html index 6f1760bf2d1..3ccf4af6dbb 100644 --- a/readthedocs/templates/projects/project_traffic_analytics.html +++ b/readthedocs/templates/projects/project_traffic_analytics.html @@ -23,10 +23,10 @@

{% trans "Top viewed pages in the past 30 days" %}

    - {% for page, count in top_viewed_pages %} + {% for row in top_pages_200 %}
  • - {{ page }} - {{ count }} + {{ row.path }} + {{ row.count }}
  • {% empty %}
  • @@ -41,13 +41,43 @@

    {% trans "Top viewed pages in the past 30 days" %}


    -

    {% trans "Daily pageview totals" %}

    +

    {% trans "Daily pageview totals" %}

    +

    {% trans "Not Found (404) pages" %}

    + +
    +
    +
      + {% for row in top_pages_404 %} +
    • + {{ row.path }} + {{ row.count }} +
    • + {# TODO: Add a `Create Redirect` button here with the URL pre-filled as the `from` URL #} + {% empty %} +
    • +

      + {% trans "No data available." %} +

      +
    • + {% endfor %} +
    +
    +
    + + + {% blocktrans %} + * The way that we track 404 pages is on our backend hosting. + It's possible the number of times each page is viewed is undercounted because of caching in the browser or in a CDN. + {% endblocktrans %} + + + {% endblock %} {% block extra_scripts %}