Skip to content

Commit

Permalink
Traffic analytics: track 404s (#9027)
Browse files Browse the repository at this point in the history
Co-authored-by: Eric Holscher <[email protected]>
Co-authored-by: Manuel Kaufmann <[email protected]>
  • Loading branch information
3 people authored Apr 7, 2022
1 parent 9a56069 commit 759733c
Show file tree
Hide file tree
Showing 9 changed files with 340 additions and 72 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
.cache
.coverage
.coverage.*
coverage.xml
.idea
.vagrant
.vscode
Expand Down
55 changes: 55 additions & 0 deletions readthedocs/analytics/migrations/0002_track_status_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Generated by Django 3.2.12 on 2022-03-29 17:51

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    """Extend ``pageview`` so views can be tracked per HTTP status code."""

    # Migrations that must be applied before this one.
    dependencies = [
        ("builds", "0041_track_task_id"),
        ("projects", "0087_use_booleanfield_null"),
        ("analytics", "0001_initial"),
    ]

    operations = [
        # New nullable column holding the full path
        # (including the version and language parts).
        migrations.AddField(
            model_name="pageview",
            name="full_path",
            field=models.CharField(
                blank=True,
                help_text="Full path including the version and language parts.",
                max_length=4096,
                null=True,
            ),
        ),
        # New column with the HTTP status code of the view;
        # rows that already exist get the default of 200.
        migrations.AddField(
            model_name="pageview",
            name="status",
            field=models.PositiveIntegerField(
                default=200, help_text="HTTP status code"
            ),
        ),
        # Redeclare ``path`` with a help text stating it is relative to the version.
        migrations.AlterField(
            model_name="pageview",
            name="path",
            field=models.CharField(
                help_text="Path relative to the version.", max_length=4096
            ),
        ),
        # Redeclare ``version`` as a nullable foreign key, so a page view
        # can be stored without being attached to a version.
        migrations.AlterField(
            model_name="pageview",
            name="version",
            field=models.ForeignKey(
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="page_views",
                to="builds.version",
                verbose_name="Version",
            ),
        ),
        # Add ``status`` to the uniqueness key, so the same path can have
        # one row per status code on the same date.
        migrations.AlterUniqueTogether(
            name="pageview",
            unique_together={("project", "version", "path", "date", "status")},
        ),
    ]
134 changes: 96 additions & 38 deletions readthedocs/analytics/models.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""Analytics modeling to help understand the projects on Read the Docs."""

import datetime
from collections import namedtuple
from urllib.parse import urlparse

from django.db import models
from django.db.models import Sum
from django.utils import timezone
from django.utils.translation import gettext_lazy as _

from readthedocs.builds.models import Version
from readthedocs.core.resolver import resolve, resolve_path
from readthedocs.projects.models import Project


Expand All @@ -19,6 +21,27 @@ def _last_30_days_iter():
return (thirty_days_ago + timezone.timedelta(days=n) for n in range(31))


class PageViewManager(models.Manager):

    """Manager with helpers to record page views."""

    def register_page_view(self, project, version, path, full_path, status):
        """
        Record one view of a page, creating or incrementing today's counter.

        The row is looked up by the fields that make a page view unique
        (project, version, path, date and status). ``full_path`` is not
        part of that key, so it is only used when a new row is created.
        Looking it up as well would miss an existing row that has the same
        key but a different (or empty) ``full_path``, and the subsequent
        insert would then violate the unique constraint.

        :param project: project the page belongs to.
        :param version: version the page belongs to (stored as given).
        :param path: path relative to the version.
        :param full_path: full path including the version and language parts.
        :param status: HTTP status code of the response.
        :returns: the created or updated page view object.
        """
        # Normalize paths to avoid duplicates.
        path = "/" + path.lstrip("/")
        full_path = "/" + full_path.lstrip("/")

        page_view, created = self.get_or_create(
            project=project,
            version=version,
            path=path,
            date=timezone.now().date(),
            status=status,
            defaults={"view_count": 1, "full_path": full_path},
        )
        if not created:
            # Increment in the database (F expression) instead of in Python,
            # so concurrent requests don't overwrite each other's count.
            page_view.view_count = models.F("view_count") + 1
            page_view.save(update_fields=["view_count"])
        return page_view


class PageView(models.Model):

"""PageView counts per day for a project, version, and path."""
Expand All @@ -28,65 +51,94 @@ class PageView(models.Model):
related_name='page_views',
on_delete=models.CASCADE,
)
# NOTE: this could potentially be removed,
# since isn't being used and not all page
# views (404s) are attached to a version.
version = models.ForeignKey(
Version,
verbose_name=_('Version'),
related_name='page_views',
on_delete=models.CASCADE,
null=True,
)
path = models.CharField(
max_length=4096,
help_text=_("Path relative to the version."),
)
full_path = models.CharField(
max_length=4096,
help_text=_("Full path including the version and language parts."),
null=True,
blank=True,
)
path = models.CharField(max_length=4096)
view_count = models.PositiveIntegerField(default=0)
date = models.DateField(default=datetime.date.today, db_index=True)
status = models.PositiveIntegerField(
default=200,
help_text=_("HTTP status code"),
)

objects = PageViewManager()

class Meta:
unique_together = ("project", "version", "path", "date")
unique_together = ("project", "version", "path", "date", "status")

def __str__(self):
return f'PageView: [{self.project.slug}:{self.version.slug}] - {self.path} for {self.date}'
return f"PageView: [{self.project.slug}] - {self.full_path or self.path} for {self.date}"

@classmethod
def top_viewed_pages(cls, project, since=None, limit=10):
def top_viewed_pages(
cls, project, since=None, limit=10, status=200, per_version=False
):
"""
Returns top pages according to view counts.
Structure of returned data is compatible to make graphs.
Sample returned data::
{
'pages': ['index', 'config-file/v1', 'intro/import-guide'],
'view_counts': [150, 120, 100]
}
This data shows that `index` is the most viewed page having 150 total views,
followed by `config-file/v1` and `intro/import-guide` having 120 and
100 total page views respectively.
:param per_version: If `True`, group the results per version.
:returns: A list of named tuples ordered by the number of views.
Each tuple contains: path, url, and count.
"""
# pylint: disable=too-many-locals
if since is None:
since = timezone.now().date() - timezone.timedelta(days=30)

group_by = "full_path" if per_version else "path"
queryset = (
cls.objects
.filter(project=project, date__gte=since)
.values_list('path')
.annotate(total_views=Sum('view_count'))
.values_list('path', 'total_views')
.order_by('-total_views')[:limit]
cls.objects.filter(project=project, date__gte=since, status=status)
.values_list(group_by)
.annotate(count=Sum("view_count"))
.values_list(group_by, "count", named=True)
.order_by("-count")[:limit]
)

pages = []
view_counts = []

for data in queryset.iterator():
pages.append(data[0])
view_counts.append(data[1])

final_data = {
'pages': pages,
'view_counts': view_counts,
}

return final_data
PageViewResult = namedtuple("PageViewResult", "path, url, count")
result = []
parsed_domain = urlparse(resolve(project))
default_version = project.get_default_version()
for row in queryset:
if not per_version:
# If we aren't grouping by version,
# then always link to the default version.
url_path = resolve_path(
project=project,
version_slug=default_version,
filename=row.path,
)
else:
url_path = row.full_path or ""
url = parsed_domain._replace(path=url_path).geturl()
path = row.full_path if per_version else row.path
result.append(
PageViewResult(
path=path,
url=url,
count=row.count,
)
)
return result

@classmethod
def page_views_by_date(cls, project_slug, since=None):
def page_views_by_date(cls, project_slug, since=None, status=200):
"""
Returns the total page views count for last 30 days for a particular project.
Expand All @@ -102,10 +154,16 @@ def page_views_by_date(cls, project_slug, since=None):
if since is None:
since = timezone.now().date() - timezone.timedelta(days=30)

queryset = cls.objects.filter(
project__slug=project_slug,
date__gte=since,
).values('date').annotate(total_views=Sum('view_count')).order_by('date')
queryset = (
cls.objects.filter(
project__slug=project_slug,
date__gte=since,
status=status,
)
.values("date")
.annotate(total_views=Sum("view_count"))
.order_by("date")
)

count_dict = dict(
queryset.order_by('date').values_list('date', 'total_views')
Expand Down
17 changes: 5 additions & 12 deletions readthedocs/analytics/proxied_api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
"""Analytics views that are served from the same domain as the docs."""

from functools import lru_cache
from urllib.parse import urlparse

from django.db.models import F
from django.shortcuts import get_object_or_404
from django.utils import timezone
from rest_framework.response import Response
from rest_framework.views import APIView

Expand Down Expand Up @@ -67,20 +65,15 @@ def increase_page_view_count(self, request, project, version, absolute_uri):
return

path = unresolved.filename
full_path = urlparse(absolute_uri).path

fields = dict(
PageView.objects.register_page_view(
project=project,
version=version,
path=path,
date=timezone.now().date(),
)
page_view, created = PageView.objects.get_or_create(
**fields,
defaults={'view_count': 1},
full_path=full_path,
status=200,
)
if not created:
page_view.view_count = F('view_count') + 1
page_view.save(update_fields=['view_count'])


class AnalyticsView(SettingsOverrideObject):
Expand Down
12 changes: 6 additions & 6 deletions readthedocs/analytics/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ def test_increase_page_view_count(self):

assert (
PageView.objects.all().count() == 1
), f'PageView object for path \'{self.absolute_uri}\' is already created'
assert PageView.objects.filter(path='index.html').count() == 1
), f"PageView object for path '{self.absolute_uri}' is already created"
assert PageView.objects.filter(path="/index.html").count() == 1
assert (
PageView.objects.all().first().view_count == 2
), f'\'{self.absolute_uri}\' has 2 views now'
Expand All @@ -154,8 +154,8 @@ def test_increase_page_view_count(self):

assert (
PageView.objects.all().count() == 2
), f'PageView object for path \'{self.absolute_uri}\' is created for two days (yesterday and today)'
assert PageView.objects.filter(path='index.html').count() == 2
), f"PageView object for path '{self.absolute_uri}' is created for two days (yesterday and today)"
assert PageView.objects.filter(path="/index.html").count() == 2
assert (
PageView.objects.all().order_by('-date').first().view_count == 1
), f'\'{self.absolute_uri}\' has 1 view today'
Expand All @@ -168,8 +168,8 @@ def test_increase_page_view_count(self):

assert (
PageView.objects.all().count() == 3
), f'PageView object for path \'{self.absolute_uri}\' is created for three days (yesterday, today & tomorrow)'
assert PageView.objects.filter(path='index.html').count() == 3
), f"PageView object for path '{self.absolute_uri}' is created for three days (yesterday, today & tomorrow)"
assert PageView.objects.filter(path="/index.html").count() == 3
assert (
PageView.objects.all().order_by('-date').first().view_count == 1
), f'\'{self.absolute_uri}\' has 1 view tomorrow'
25 changes: 15 additions & 10 deletions readthedocs/projects/views/private.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Project views for authenticated users."""

import structlog

from allauth.socialaccount.models import SocialAccount
from django.conf import settings
from django.contrib import messages
Expand Down Expand Up @@ -1180,21 +1179,26 @@ def get_context_data(self, **kwargs):
return context

# Count of views for top pages over the month
top_pages = PageView.top_viewed_pages(project, limit=25)
top_viewed_pages = list(zip(
top_pages['pages'],
top_pages['view_counts']
))
top_pages_200 = PageView.top_viewed_pages(project, limit=25)
top_pages_404 = PageView.top_viewed_pages(
project,
limit=25,
status=404,
per_version=True,
)

# Aggregate pageviews grouped by day
page_data = PageView.page_views_by_date(
project_slug=project.slug,
)

context.update({
'top_viewed_pages': top_viewed_pages,
'page_data': page_data,
})
context.update(
{
"top_pages_200": top_pages_200,
"page_data": page_data,
"top_pages_404": top_pages_404,
}
)

return context

Expand All @@ -1220,6 +1224,7 @@ def _get_csv_data(self):
PageView.objects.filter(
project=project,
date__gte=days_ago,
status=200,
)
.order_by('-date')
.values_list(*[value for _, value in values])
Expand Down
Loading

0 comments on commit 759733c

Please sign in to comment.