Skip to content

Commit

Permalink
[staged-updates] optimizes the crap out of the tasks progress page to…
Browse files Browse the repository at this point in the history
… stop it killing servers.

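This removes a feature from the page (the warning shown when analyses on a study were computed with different Forest parameters), but that feature had little value and the page was previously too slow for anyone to use; it is usable now.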
  • Loading branch information
biblicabeebli committed Aug 23, 2024
1 parent 530acce commit 1250974
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 73 deletions.
54 changes: 2 additions & 52 deletions database/study_models.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
from __future__ import annotations

import operator
from datetime import datetime, timedelta, tzinfo
from typing import Any, Dict, Optional
from datetime import datetime, tzinfo
from typing import Any, Dict

from dateutil.tz import gettz
from django.core.exceptions import ObjectDoesNotExist
from django.core.validators import MaxValueValidator, MinValueValidator
from django.db import models
from django.db.models import F, Func, Manager
from django.db.models.query import QuerySet
from django.utils import timezone
from django.utils.timezone import localtime

from constants.data_stream_constants import ALL_DATA_STREAMS
Expand Down Expand Up @@ -126,54 +124,6 @@ def get_researchers(self) -> QuerySet[Researcher]:
from database.user_models_researcher import Researcher
return Researcher.objects.filter(study_relations__study=self)

def get_earliest_data_time_bin(
    self, only_after_epoch: bool = True, only_before_now: bool = True
) -> Optional[datetime]:
    """ Oldest time bin for this study; a thin wrapper that calls _get_data_time_bin with
    earliest=True and forwards both filter flags unchanged. """
    lookup_options = {
        "earliest": True,
        "only_after_epoch": only_after_epoch,
        "only_before_now": only_before_now,
    }
    return self._get_data_time_bin(**lookup_options)

def get_latest_data_time_bin(
    self, only_after_epoch: bool = True, only_before_now: bool = True
) -> Optional[datetime]:
    """ Newest time bin for this study; a thin wrapper that calls _get_data_time_bin with
    earliest=False and forwards both filter flags unchanged. """
    lookup_options = {
        "earliest": False,
        "only_after_epoch": only_after_epoch,
        "only_before_now": only_before_now,
    }
    return self._get_data_time_bin(**lookup_options)

def _get_data_time_bin(
    self, earliest=True, only_after_epoch: bool = True, only_before_now: bool = True
) -> Optional[datetime]:
    """ Return the earliest or latest ChunkRegistry time bin datetime for this study, or None
    if no time bin passes the requested filters.
    Note: As of 2021-07-01, running the query as a QuerySet filter or sorting the QuerySet can
    take upwards of 30 seconds. Doing the logic in python speeds this up tremendously.
    Args:
        earliest: if True, will return earliest datetime; if False, will return latest datetime
        only_after_epoch: if True, will filter results only for datetimes after the Unix epoch
            (1970-01-01T00:00:00Z)
        only_before_now: if True, will filter results only for datetimes before now """

    time_bins: QuerySet[datetime] = self.chunk_registries.values_list("time_bin", flat=True)
    # The current time is only needed when future-dated bins are being excluded.
    now = timezone.now() if only_before_now else None

    # Lazily drop bins at/before the epoch and bins in the future, as requested.
    candidates = (
        time_bin
        for time_bin in time_bins
        if not (only_after_epoch and time_bin.timestamp() <= 0)
        and not (only_before_now and time_bin > now)
    )
    # default=None covers a study with no chunk registries, or with none passing the filters.
    select = min if earliest else max
    return select(candidates, default=None)

def notification_events(self, **archived_event_filter_kwargs):
from database.schedule_models import ArchivedEvent
return ArchivedEvent.objects.filter(
Expand Down
20 changes: 5 additions & 15 deletions endpoints/forest_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from database.user_models_participant import Participant
from libs.django_forms.forms import CreateTasksForm
from libs.efficient_paginator import EfficientQueryPaginator
from libs.endpoint_helpers.dashboard_helpers import get_first_and_last_days_of_data
from libs.endpoint_helpers.summary_statistic_helpers import SummaryStatisticsPaginator
from libs.internal_types import ParticipantQuerySet, ResearcherRequest
from libs.s3 import NoSuchKeyException
Expand Down Expand Up @@ -75,11 +76,6 @@ def forest_tasks_progress(request: ResearcherRequest, study_id=None):
# number of forest tasks shouldn't be the bottleneck here.
tasks = ForestTask.objects.filter(participant__in=participants).order_by("created_on")

# these are quite optimized buuuuut it is still slow.
start_date = (study.get_earliest_data_time_bin() or study.created_on).date()
end_date = (study.get_latest_data_time_bin() or timezone.now()).date()

params = {}
results = defaultdict(lambda: "-")
chart_elements_lookup = {False: "N", None: "?"}
# this loop builds the chart of whether there are forest results for date ranges
Expand All @@ -102,7 +98,10 @@ def forest_tasks_progress(request: ResearcherRequest, study_id=None):
# 3. If in_table is a ? we can just skip it because we cannot upgrade from ? to Y here.
# So, we just skip if we are already at ? in the chart element, and otherwise we do the lookup.
results[key] = chart_elements_lookup[output_exists]
params[key] = task.safe_unpickle_parameters_as_string()

start_date, end_date = get_first_and_last_days_of_data(study)
start_date = start_date or study.created_on.date()
end_date = end_date or timezone.now().date()

# generate the date range for the chart, we need it many times.
dates = list(daterange(start_date, end_date, inclusive=True))
Expand All @@ -114,14 +113,6 @@ def forest_tasks_progress(request: ResearcherRequest, study_id=None):
[results[(participant.id, tree_name, date)] for date in dates]
chart.append(row)

# ensure that within each tree, only a single set of param values are used (only the most recent runs
# are considered, and unsuccessful runs are assumed to invalidate old runs, clearing params)
params_conflict = False
for tree_name in {k[1] for k in params.keys()}:
if len({m for k, m in params.items() if m is not None and k[1] == tree_name}) > 1:
params_conflict = True
break

chart_json = orjson.dumps(chart).decode() # may be huge, but orjson is very fast.
return render(
request,
Expand All @@ -130,7 +121,6 @@ def forest_tasks_progress(request: ResearcherRequest, study_id=None):
study=study,
chart_columns=["participant", "tree"] + dates,
status_choices=ForestTaskStatus,
params_conflict=params_conflict,
start_date=start_date,
end_date=end_date,
chart=chart_json # this uses the jinja safe filter and should never involve user input
Expand Down
5 changes: 0 additions & 5 deletions frontend/templates/forest/forest_tasks_progress.html
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,5 @@
{% block title %}Forest Analysis Progress{% endblock %}

{% block content %}
{% if params_conflict %}
<div class="alert alert-warning">
Warning: There are analyses on this study that were computed using different Forest parameters.
</div>
{% endif %}
<table id="analysis_chart" class="display" width="100%"></table>
{% endblock %}
2 changes: 1 addition & 1 deletion frontend/templates/forest/task_log.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
</div>

<div class="table-responsive col-xs-12 well">
<table class="table">
<table class="table ng-cloak">
<thead>
<tr>
<th scope="col">Created On</th>
Expand Down

0 comments on commit 1250974

Please sign in to comment.