Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(scrapers.admin): create materialized view and admin page #4662

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions cl/scrapers/admin.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from django.contrib import admin
from django.db import models

from cl.scrapers.models import (
PACERFreeDocumentLog,
Expand Down Expand Up @@ -29,3 +30,70 @@ class PACERFreeDocumentRowAdmin(admin.ModelAdmin):


# Expose UrlHash in the Django admin using the default ModelAdmin options.
admin.site.register(UrlHash)


class MVLatestOpinions(models.Model):
    """Unmanaged model over the `scrapers_mv_latest_opinion` materialized view.

    Used to monitor scrapers: the view holds one row per court that has an
    opinion scraper in use and whose most recently created opinion is more
    than seven days old.

    The view is neither created nor refreshed by this class. `query` holds
    the SQL that defines it, and
    `REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion` must be run
    periodically. Until then every row, including `time_since`, reflects the
    moment of the last refresh rather than the current time.
    """

    # Reference DDL for the view. Nothing in this class executes it; the SQL
    # text is stored verbatim.
    query = """
CREATE MATERIALIZED VIEW
scrapers_mv_latest_opinion
AS
(
SELECT
court_id,
max(so.date_created) as latest_creation_date,
(now() - max(so.date_created))::text as time_since
FROM
(
SELECT id, court_id
FROM search_docket
WHERE court_id IN (
SELECT id
FROM search_court
/*
Only check courts with scrapers in use
*/
WHERE
has_opinion_scraper
AND in_use
)
) sd
INNER JOIN
(SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id
INNER JOIN
search_opinion so ON so.cluster_id = soc.id
GROUP BY
sd.court_id
HAVING
/*
Only return results for courts with no updates in a week
*/
now() - max(so.date_created) > interval '7 days'
ORDER BY
2 DESC
)
"""
    # A Django model must have a primary key; the view is grouped by court,
    # so court_id is unique per row.
    court_id = models.TextField(primary_key=True)
    # The column is max(search_opinion.date_created). Assuming date_created is
    # a timestamp column (TODO confirm against the Opinion model), declare a
    # DateTimeField so the model type matches the column and keeps the
    # time-of-day component.
    latest_creation_date = models.DateTimeField()
    # Interval since the latest opinion, already cast to text by the view.
    time_since = models.TextField()

    class Meta:
        managed = False  # ignore this model in migrations
        db_table = "scrapers_mv_latest_opinion"


@admin.register(MVLatestOpinions)
class MVLatestOpinionsAdmin(admin.ModelAdmin):
    """Read-only admin page listing the latest opinion date for each court.

    Use this to monitor silently failing scrapers.

    The underlying model is backed by a materialized view, which cannot be
    written to, so adding, changing and deleting rows are all disabled.
    """

    list_display = ["court_id", "latest_creation_date", "time_since"]

    def has_add_permission(self, request):
        """Disallow adding rows; the view's contents come from its query."""
        return False

    def has_change_permission(self, request, obj=None):
        """Disallow editing rows; a materialized view is not updatable."""
        return False

    def has_delete_permission(self, request, obj=None):
        """Disallow deleting rows; a materialized view is not updatable."""
        return False
Loading