From 9973b78c066b9adf07896a1e750e3dfad1a492d2 Mon Sep 17 00:00:00 2001
From: Liam Keegan <liam@keegan.ch>
Date: Fri, 21 Feb 2025 15:30:33 +0100
Subject: [PATCH] Refactor milestone answers statistics calculation

- refactor statistics calculation
  - add `update_stats` function which updates milestones and milestone group statistics
    - `incremental_update` argument
      - if `True`, then the existing statistics are updated using only the new answers, as before
      - if `False`, then all statistics are recalculated using all answers (may be needed if e.g. some junk answers are deleted by an admin)
  - reduce duplication
- `AnswerSession`
  - add `expired` flag: initially False
    - set to True by `get_or_create_current_milestone_answer_session` if it was created 7 or more days ago
    - set to True when stats are updated if it was created 9 days or more ago
      - includes a grace period to avoid setting a currently in use answer session to expired
    - once an answer session is expired, then answers can no longer be modified / submitted by the user
      - this should ensure that answers cannot be modified after they have been included in the statistics
    - resolves #219
  - add `included_in_statistics` flag: initially False
    - set to True once the answers from this session are included in the statistics
- `MilestoneAnswer`
  - remove `included_in_milestone_statistics` and `included_in_milestonegroup_statistics` flags
    - this is now done at the level of an answer session rather than for each individual answer
- milestone feedback functions
  - insert a `TrafficLight.invalid.value` instead of raising an exception if there are no statistics for a milestone id or group
  - no longer recalculate stats when constructing feedback to avoid slowing down a user request in this case
- add `/update-milestone-age-scores` admin endpoint to recalculate the statistics
  - TODO: add tests, add button(s) to admin interface?
  - TODO: add scheduled calling of this function
---
 .../src/mondey_backend/models/milestones.py   |   4 +-
 .../routers/admin_routers/milestones.py       |   8 +
 .../src/mondey_backend/routers/scores.py      | 112 ++---
 .../src/mondey_backend/routers/statistics.py  | 426 +++++++++---------
 .../src/mondey_backend/routers/users.py       |   2 +
 .../src/mondey_backend/routers/utils.py       |  22 +-
 mondey_backend/tests/conftest.py              | 202 ++++-----
 .../admin_routers/test_admin_milestones.py    |  18 +-
 mondey_backend/tests/routers/test_auth.py     |   5 +-
 .../tests/routers/test_milestones.py          |   4 +-
 mondey_backend/tests/routers/test_users.py    |  45 +-
 mondey_backend/tests/utils/test_scores.py     | 147 +++---
 mondey_backend/tests/utils/test_statistics.py | 219 +++------
 mondey_backend/tests/utils/test_utils.py      |   7 +-
 14 files changed, 531 insertions(+), 690 deletions(-)

diff --git a/mondey_backend/src/mondey_backend/models/milestones.py b/mondey_backend/src/mondey_backend/models/milestones.py
index a08c72b1..bd3622a4 100644
--- a/mondey_backend/src/mondey_backend/models/milestones.py
+++ b/mondey_backend/src/mondey_backend/models/milestones.py
@@ -158,8 +158,6 @@ class MilestoneAnswer(SQLModel, table=True):
     )
     milestone_group_id: int = Field(default=None, foreign_key="milestonegroup.id")
     answer: int
-    included_in_milestone_statistics: bool = False
-    included_in_milestonegroup_statistics: bool = False
 
 
 class MilestoneAnswerSession(SQLModel, table=True):
@@ -171,6 +169,8 @@ class MilestoneAnswerSession(SQLModel, table=True):
             "server_default": text("CURRENT_TIMESTAMP"),
         }
     )
+    expired: bool = False
+    included_in_statistics: bool = False
     answers: Mapped[dict[int, MilestoneAnswer]] = dict_relationship(key="milestone_id")
 
 
diff --git a/mondey_backend/src/mondey_backend/routers/admin_routers/milestones.py b/mondey_backend/src/mondey_backend/routers/admin_routers/milestones.py
index 8dde8167..77c3557c 100644
--- a/mondey_backend/src/mondey_backend/routers/admin_routers/milestones.py
+++ b/mondey_backend/src/mondey_backend/routers/admin_routers/milestones.py
@@ -20,6 +20,7 @@
 from ...models.milestones import SubmittedMilestoneImage
 from ...models.milestones import SubmittedMilestoneImagePublic
 from ...models.utils import ItemOrder
+from ..statistics import update_stats
 from ..utils import add
 from ..utils import get
 from ..utils import milestone_group_image_path
@@ -195,4 +196,11 @@ def get_milestone_age_scores(
 
         return collection
 
+    @router.post(
+        "/update-milestone-age-scores/{incremental}",
+        response_model=str,
+    )
+    def update_milestone_age_scores(session: SessionDep, incremental: bool) -> str:
+        return update_stats(session, incremental)
+
     return router
diff --git a/mondey_backend/src/mondey_backend/routers/scores.py b/mondey_backend/src/mondey_backend/routers/scores.py
index b9c66e33..35b0395f 100644
--- a/mondey_backend/src/mondey_backend/routers/scores.py
+++ b/mondey_backend/src/mondey_backend/routers/scores.py
@@ -1,8 +1,6 @@
 from __future__ import annotations
 
 import logging
-from datetime import datetime
-from datetime import timedelta
 from enum import Enum
 from typing import cast
 
@@ -15,8 +13,6 @@
 from ..models.milestones import MilestoneAnswerSession
 from ..models.milestones import MilestoneGroupAgeScore
 from ..models.milestones import MilestoneGroupAgeScoreCollection
-from .statistics import calculate_milestone_statistics_by_age
-from .statistics import calculate_milestonegroup_statistics_by_age
 from .utils import get_child_age_in_months
 
 
@@ -88,7 +84,6 @@ def compute_milestonegroup_feedback_summary(
     by first calculating the mean score over all milestones that belong to the milestonegroup that
     are relevant for the child when the given answersession was created. The mean is then
     compared against the mean and standard deviation over the known population of children for the child's age.
-    When the statistics is outdated (older than a week currently) or there is none, it is recomputed and updated in the database.
     See `compute_feedback_simple` for the feedback logic.
 
     Parameters
@@ -123,53 +118,35 @@ def compute_milestonegroup_feedback_summary(
     logger.debug(f"  child age in months: {age}")
     # extract milestonegroups
     groups = set(answer.milestone_group_id for answer in answersession.answers.values())
-    today = datetime.now()
 
     # for each milestonegroup, get the statistics, compute the current mean, and compute the feedback
-    # if the statistics is older than a week, we update it with the current data
     feedback: dict[int, int] = {}
     for group in groups:
         logger.debug(f"  group: {group}")
         stats = session.get(MilestoneGroupAgeScoreCollection, group)
-        logger.debug(f"  old stats: {stats}")
-        if stats is not None:
+        if stats is None:
+            logger.debug("  no stats")
+            feedback[group] = TrafficLight.invalid.value
+        else:
+            logger.debug(f"  stats: {stats}")
             for i, score in enumerate(stats.scores):
                 if score.count > 0:
                     logger.debug(
-                        f"   old score: , {i}, {score.count}, {score.avg_score}, {score.stddev_score}"
+                        f"   score: , {i}, {score.count}, {score.avg_score}, {score.stddev_score}"
                     )
-
-        if stats is None or stats.created_at < today - timedelta(days=7):
-            new_stats = calculate_milestonegroup_statistics_by_age(session, group)
-
-            if new_stats is None:
-                raise ValueError("No statistics for milestone group: ", group)
-
-            # update stuff in database
-            for i, new_score in enumerate(new_stats.scores):
-                if new_score.count > 0:
-                    logger.debug(
-                        f"   new_score: , {i}, {new_score.count}, {new_score.avg_score}, {new_score.stddev_score}"
-                    )
-                session.merge(new_score)
-
-            session.merge(new_stats)
-            session.commit()
-            stats = new_stats
-
-        # extract the answers for the current milestone group
-        group_answers = [
-            answer.answer + 1
-            for answer in answersession.answers.values()
-            if answer.milestone_group_id == group
-        ]
-        logger.debug(
-            f'  group answers: , {group_answers}, "mean: ", {np.mean(group_answers)}'
-        )
-        # use the statistics recorded for a certain age as the basis for the feedback computation
-        feedback[group] = compute_feedback_simple(
-            stats.scores[age], float(np.mean(group_answers))
-        )
+            # extract the answers for the current milestone group
+            group_answers = [
+                answer.answer + 1
+                for answer in answersession.answers.values()
+                if answer.milestone_group_id == group
+            ]
+            logger.debug(
+                f'  group answers: , {group_answers}, "mean: ", {np.mean(group_answers)}'
+            )
+            # use the statistics recorded for a certain age as the basis for the feedback computation
+            feedback[group] = compute_feedback_simple(
+                stats.scores[age], float(np.mean(group_answers))
+            )
     logger.debug(f"summary feedback: {feedback}")
     return feedback
 
@@ -179,8 +156,8 @@ def compute_milestonegroup_feedback_detailed(
 ) -> dict[int, dict[int, int]]:
     """
     Compute the per-milestone (detailed) feedback for all answers in a given answersession.
-    This is done by comparing the given answer per milestone against the mean and standard deviation of the known population of children for the child's age. If this statistics is outdated (older than a week currently) or is
-    missing, it is recomputed and updated in the database. See `compute_feedback_simple` for the feedback logic.
+    This is done by comparing the given answer per milestone against the mean and standard deviation of the known population of children for the child's age.
+    See `compute_feedback_simple` for the feedback logic.
     Return a dictionary mapping milestonegroup -> [milestone -> feedback].
     Parameters
     ----------
@@ -214,49 +191,28 @@ def compute_milestonegroup_feedback_detailed(
 
     age = get_child_age_in_months(child, answersession.created_at)
     logger.debug(f"  child age in months: {age}")
-    today = datetime.today()
 
     # for each milestonegroup, get the statistics, compute the current mean, and compute the feedback
     feedback: dict[int, dict[int, int]] = {}
     for milestone_id, answer in answersession.answers.items():
-        # try to get statistics for the current milestone and update it if it's not there
-        # or is too old
+        logger.debug(f"  milestone id: {milestone_id}, answer: {answer.answer + 1}")
         stats = session.get(MilestoneAgeScoreCollection, milestone_id)
-        logger.debug(f"  old stats: {stats}")
-        if stats is not None:
+        logger.debug(f"  stats: {stats}")
+        if answer.milestone_group_id not in feedback:
+            feedback[answer.milestone_group_id] = {}
+        if stats is None:
+            feedback[answer.milestone_group_id][cast(int, answer.milestone_id)] = (
+                TrafficLight.invalid.value
+            )
+        else:
             for i, score in enumerate(stats.scores):
                 if score.count > 0:
                     logger.debug(
-                        f"   old score: {i}, {score.count}, {score.avg_score}, {score.stddev_score}"
+                        f"   score: {i}, {score.count}, {score.avg_score}, {score.stddev_score}"
                     )
-
-        if stats is None or stats.created_at < today - timedelta(days=7):
-            new_stats = calculate_milestone_statistics_by_age(session, milestone_id)
-
-            if new_stats is None:
-                raise ValueError(
-                    "No new statistics could be calculated for milestone: ",
-                    milestone_id,
-                )
-
-            # update stuff in database
-            for i, new_score in enumerate(new_stats.scores):
-                if new_score.count > 0:
-                    logger.debug(
-                        f"   new_score: , {i}, {new_score.count}, {new_score.avg_score}, {new_score.stddev_score}"
-                    )
-                session.merge(new_score)
-
-            session.merge(new_stats)
-            session.commit()
-            stats = new_stats
-
-        if answer.milestone_group_id not in feedback:
-            feedback[answer.milestone_group_id] = {}
-
-        feedback[answer.milestone_group_id][cast(int, answer.milestone_id)] = (
-            compute_feedback_simple(stats.scores[age], answer.answer + 1)
-        )
+            feedback[answer.milestone_group_id][cast(int, answer.milestone_id)] = (
+                compute_feedback_simple(stats.scores[age], answer.answer + 1)
+            )
 
     logger.debug(f" detailed feedback: {feedback}")
 
diff --git a/mondey_backend/src/mondey_backend/routers/statistics.py b/mondey_backend/src/mondey_backend/routers/statistics.py
index 1be97ffa..ba4230e3 100644
--- a/mondey_backend/src/mondey_backend/routers/statistics.py
+++ b/mondey_backend/src/mondey_backend/routers/statistics.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import datetime
+import logging
+from collections import defaultdict
 from collections.abc import Sequence
 
 import numpy as np
@@ -23,29 +25,30 @@
 # we are using Welford's method here. This necessitates recording the count.
 def _add_sample(
     count: int,
-    mean: float | int,
-    m2: float | int,
-    new_value: float | int,
-) -> tuple[int, float | int, float | int]:
+    mean: float,
+    m2: float,
+    new_value: float,
+) -> tuple[int, float, float]:
     """
-    Add a sample to the the current statistics. This function uses an online algorithm to compute the mean (directly) and an intermediate for the variance. This uses Welford's method with a slight
-    modification to avoid numerical instability. See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-    for details.
+    Add a sample to the current statistics.
+    This function uses an online algorithm to compute the mean (directly) and an intermediate for the variance.
+    This uses Welford's method with a slight modification to avoid numerical instability.
+    See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
 
     Parameters
     ----------
     count : int
         number of samples added so far.
-    mean : float | int
+    mean : float
         current mean of the samples.
-    m2 : float | int
+    m2 : float
         intermediate value for the variance computation.
-    new_value : float | int
+    new_value : float
         new sample to be added to the statistics.
 
     Returns
     -------
-    tuple[float | int, float | int, float | int]
+    tuple[int, float, float]
         updated count, mean, and m2 values.
     """
     count += 1
@@ -58,25 +61,27 @@ def _add_sample(
 
 def _finalize_statistics(
     count: int | np.ndarray,
-    mean: float | int | np.ndarray,
-    m2: float | int | np.ndarray,
-) -> tuple[float | int | np.ndarray, float | np.ndarray, float | np.ndarray]:
+    mean: float | np.ndarray,
+    m2: float | np.ndarray,
+) -> tuple[int | np.ndarray, float | np.ndarray, float | np.ndarray]:
     """
-    Compute the mean and standard deviation from the intermediate values. This function is used to finalize the statistics after a batch of new samples have been added. If arrays are supplied, they all need to have the
-    same shape. Values for the standard deviation for which the count is less than 2 are set to zero.
+    Compute the mean and standard deviation from the intermediate values.
+    This function is used to finalize the statistics after a batch of new samples have been added.
+    If arrays are supplied, they all need to have the same shape.
+    Values for the standard deviation for which the count is less than 2 are set to zero.
 
     Parameters
     ----------
     count : int | np.ndarray
         Current counts of samples. If ndarray, it contains the number of samples for each entry.
-    mean : float | int | np.ndarray
+    mean : float | np.ndarray
         Current mean value of the samples. If ndarray, it contains the mean for each entry.
-    m2 : float | int | np.ndarray
+    m2 : float | np.ndarray
         Current intermediate value for variance computation. If ndarray, it contains the intermediate value for each entry.
 
     Returns
     -------
-    tuple[float | int | np.ndarray, float | np.ndarray, float | np.ndarray]
+    tuple[int | np.ndarray, float | np.ndarray, float | np.ndarray]
         updated count, mean, and standard deviation values.
 
     Raises
@@ -93,8 +98,12 @@ def _finalize_statistics(
         else:
             var = m2 / (count - 1)
             return count, mean, np.sqrt(var)
-    elif all(isinstance(x, np.ndarray) for x in [count, mean, m2]):
-        if not all(x.shape == count.shape for x in [mean, m2]):  # type: ignore
+    elif (
+        isinstance(count, np.ndarray)
+        and isinstance(mean, np.ndarray)
+        and isinstance(m2, np.ndarray)
+    ):
+        if count.shape != m2.shape or mean.shape != m2.shape:
             raise ValueError(
                 "Given arrays for statistics computation must have the same shape."
             )
@@ -102,8 +111,8 @@ def _finalize_statistics(
         with np.errstate(invalid="ignore"):
             valid_counts = count >= 2
             variance = m2
-            variance[valid_counts] /= count[valid_counts] - 1  # type: ignore
-            variance[np.invert(valid_counts)] = 0.0  # type: ignore
+            variance[valid_counts] /= count[valid_counts] - 1
+            variance[np.invert(valid_counts)] = 0.0
             return (
                 count,
                 np.nan_to_num(mean),
@@ -158,7 +167,7 @@ def _get_statistics_by_age(
 
     # online algorithm computes variance, compute m2 from stddev
     # we can ignore count-1 <= 0 because stddev is zero in this case
-    m2 = stddev**2 * (count - 1)
+    m2 = np.pow(stddev, 2) * (count - 1)
 
     for answer in answers:
         age = child_ages[answer.answer_session_id]  # type: ignore
@@ -169,224 +178,229 @@ def _get_statistics_by_age(
         avg[age] = new_avg
         m2[age] = new_m2
 
-    count, avg, stddev = _finalize_statistics(count, avg, m2)  # type: ignore
+    count, avg, stddev = _finalize_statistics(count, avg, m2)
 
     return count, avg, stddev
 
 
+def make_any_stale_sessions_inactive(session: SessionDep) -> None:
+    days_after_which_session_is_stale = 9
+    stale_date = datetime.datetime.now() - datetime.timedelta(
+        days=days_after_which_session_is_stale
+    )
+    for stale_milestone_answer_session in session.exec(
+        select(MilestoneAnswerSession)
+        .where(~col(MilestoneAnswerSession.expired))
+        .where(col(MilestoneAnswerSession.created_at) <= stale_date)
+    ).all():
+        stale_milestone_answer_session.expired = True
+        session.add(stale_milestone_answer_session)
+    session.commit()
+
+
+def update_stats(session: SessionDep, incremental_update: bool) -> str:
+    logger = logging.getLogger(__name__)
+    logger.debug(
+        f"Starting {'incremental' if incremental_update else 'full'} statistics update"
+    )
+
+    make_any_stale_sessions_inactive(session)
+
+    # get MilestoneAnswerSessions to be used for calculating statistics
+    answer_session_filter = select(MilestoneAnswerSession).where(
+        col(MilestoneAnswerSession.expired)
+    )
+    if incremental_update:
+        answer_session_filter = answer_session_filter.where(
+            ~col(MilestoneAnswerSession.included_in_statistics)
+        )
+    milestone_answer_sessions = session.exec(answer_session_filter).all()
+    child_ages = _get_answer_session_child_ages_in_months(
+        session, milestone_answer_sessions
+    )
+    logger.debug(f"  - found {len(milestone_answer_sessions)} answer sessions")
+
+    # construct a list of MilestoneAnswers for each Milestone and MilestoneGroup
+    milestone_answers: dict[int, list[MilestoneAnswer]] = defaultdict(list)
+    milestone_group_answers: dict[int, list[MilestoneAnswer]] = defaultdict(list)
+    for milestone_answer_session in milestone_answer_sessions:
+        for milestone_id, answer in milestone_answer_session.answers.items():
+            milestone_answers[milestone_id].append(answer)
+            milestone_group_answers[answer.milestone_group_id].append(answer)
+
+    # update milestone statistics
+    logger.debug(f"  - updating {len(milestone_answers)} milestone statistics...")
+    for milestone_id, answers in milestone_answers.items():
+        existing_milestone_statistics = (
+            session.get(MilestoneAgeScoreCollection, milestone_id)
+            if incremental_update
+            else None
+        )
+        new_milestone_statistics = calculate_milestone_statistics_by_age(
+            milestone_id, answers, child_ages, existing_milestone_statistics
+        )
+        session.merge(new_milestone_statistics)
+
+    # update milestone group statistics
+    logger.debug(
+        f"  - updating {len(milestone_group_answers)} milestone group statistics..."
+    )
+    for milestone_group_id, answers in milestone_group_answers.items():
+        existing_milestone_group_statistics = (
+            session.get(MilestoneGroupAgeScoreCollection, milestone_group_id)
+            if incremental_update
+            else None
+        )
+        new_milestone_group_statistics = calculate_milestonegroup_statistics_by_age(
+            milestone_group_id, answers, child_ages, existing_milestone_group_statistics
+        )
+        session.merge(new_milestone_group_statistics)
+
+    for milestone_answer_session in milestone_answer_sessions:
+        milestone_answer_session.included_in_statistics = True
+        session.add(milestone_answer_session)
+
+    session.commit()
+    logger.debug("  - done")
+    return f"{'Incremental' if incremental_update else 'Full'} statistics update complete using {len(milestone_answer_sessions)} answer sessions."
+
+
+def _extract_stats(
+    existing_statistics: MilestoneGroupAgeScoreCollection
+    | MilestoneAgeScoreCollection
+    | None,
+) -> tuple[np.ndarray | None, np.ndarray | None, np.ndarray | None]:
+    """
+    Extract count, avg and stddev arrays from an AgeScoreCollection
+
+    Parameters
+    ----------
+    existing_statistics : MilestoneGroupAgeScoreCollection | MilestoneAgeScoreCollection | None
+        The collection to extract statistics from.
+    Returns
+    -------
+    tuple[np.ndarray | None, np.ndarray | None, np.ndarray | None]
+        count, avg and stddev arrays, or (None, None, None) if existing_statistics is None.
+    """
+    if existing_statistics is None:
+        return None, None, None
+
+    last_scores = existing_statistics.scores
+    count = np.array([score.count for score in last_scores], dtype=np.int32)
+    avg_scores = np.array([score.avg_score for score in last_scores], dtype=np.float64)
+    stddev_scores = np.array(
+        [score.stddev_score for score in last_scores], dtype=np.float64
+    )
+    return count, avg_scores, stddev_scores
+
+
 def calculate_milestone_statistics_by_age(
-    session: SessionDep,
     milestone_id: int,
+    answers: Sequence[MilestoneAnswer],
+    child_ages: dict[int, int],
+    existing_statistics: MilestoneAgeScoreCollection | None,
 ) -> MilestoneAgeScoreCollection | None:
     """
     Calculate the mean, variance of a milestone per age in months.
-    Takes into account only answers from expired sessions. If no statistics exist yet, all answers from expired sessions are considered, else only the ones newer than the last statistics are considered.
+    If existing statistics are provided they are updated using the provided answers.
 
     Parameters
     ----------
-    session : SessionDep
-        database session
     milestone_id : int
         id of the milestone to calculate the statistics for
+    answers: Sequence[MilestoneAnswer]
+        the new answers to include in the statistics.
+    child_ages : dict[int, int]
+        dict of answer_session_id -> child age in months
+    existing_statistics: MilestoneAgeScoreCollection | None
+        the existing statistics to update, if any
     Returns
     -------
     MilestoneAgeScoreCollection | None
-        MilestoneAgeScoreCollection object which contains a list of MilestoneAgeScore objects,
-    one for each month, or None if there are no answers for the milestoneg and no previous statistics.
+        updated statistics, or None if there are no new answers and no existing statistics.
     """
-    session_expired_days: int = 7
-    # TODO: when the answersession eventually has an expired flag, this can go again.
+    if len(answers) == 0:
+        # return existing statistics if no new answers are available
+        return existing_statistics
 
-    # get the newest statistics for the milestone
-    last_statistics = session.get(MilestoneAgeScoreCollection, milestone_id)
+    # initialize avg and stddev scores with the existing statistics or to None if no statistics are available
+    count, avg_scores, stddev_scores = _extract_stats(existing_statistics)
 
-    # initialize avg and stddev scores with the last known statistics or to None if no statistics are available
-    child_ages = _get_answer_session_child_ages_in_months(session)
-    expiration_date = datetime.datetime.now() - datetime.timedelta(
-        days=session_expired_days
+    count, avg_scores, stddev_scores = _get_statistics_by_age(
+        answers, child_ages, count=count, avg=avg_scores, stddev=stddev_scores
     )
 
-    count = None
-    avg_scores = None
-    stddev_scores = None
-
-    if last_statistics is None:
-        # no statistics exists yet -> all answers from expired sessions are relevant
-
-        answers_query = (
-            select(MilestoneAnswer)
-            .join(
-                MilestoneAnswerSession,
-                col(MilestoneAnswer.answer_session_id) == MilestoneAnswerSession.id,
-            )
-            .where(MilestoneAnswer.milestone_id == milestone_id)
-            .where(~col(MilestoneAnswer.included_in_milestone_statistics))
-            .where(MilestoneAnswerSession.created_at < expiration_date)
-        )
-    else:
-        # initialize avg and stddev scores with the last known statistics
-        last_scores = last_statistics.scores
-        count = np.array([score.count for score in last_scores])
-        avg_scores = np.array([score.avg_score for score in last_scores])
-        stddev_scores = np.array([score.stddev_score for score in last_scores])
-
-        # we calculate the statistics with an online algorithm, so we only consider new data
-        # that has not been included in the last statistics but which stems from sessions that are expired
-        answers_query = (
-            select(MilestoneAnswer)
-            .join(
-                MilestoneAnswerSession,
-                col(MilestoneAnswer.answer_session_id) == MilestoneAnswerSession.id,
+    expected_age = _get_expected_age_from_scores(avg_scores)
+
+    # overwrite last_statistics with updated stuff --> set primary keys explicitly
+    return MilestoneAgeScoreCollection(
+        milestone_id=milestone_id,
+        expected_age=expected_age,
+        created_at=datetime.datetime.now(),
+        scores=[
+            MilestoneAgeScore(
+                age=age,
+                milestone_id=milestone_id,
+                count=int(
+                    count[age]
+                ),  # need a conversion to avoid numpy.int32 being stored as byte object
+                avg_score=avg_scores[age],
+                stddev_score=stddev_scores[age],
+                expected_score=4 if age >= expected_age else 1,
             )
-            .where(MilestoneAnswer.milestone_id == milestone_id)
-            .where(~col(MilestoneAnswer.included_in_milestone_statistics))
-            .where(col(MilestoneAnswerSession.created_at) <= expiration_date)
-        )
-
-    answers = session.exec(answers_query).all()
-
-    if len(answers) == 0:
-        # return last statistics if no new answers are available, because that is the best we can do then.
-        return last_statistics
-    else:
-        count, avg_scores, stddev_scores = _get_statistics_by_age(
-            answers, child_ages, count=count, avg=avg_scores, stddev=stddev_scores
-        )
-
-        expected_age = _get_expected_age_from_scores(avg_scores)
-
-        for answer in answers:
-            answer.included_in_milestone_statistics = True
-            session.merge(answer)
-        session.commit()
-
-        # overwrite last_statistics with updated stuff --> set primary keys explicitly
-        return MilestoneAgeScoreCollection(
-            milestone_id=milestone_id,
-            expected_age=expected_age,
-            created_at=datetime.datetime.now(),
-            scores=[
-                MilestoneAgeScore(
-                    age=age,
-                    milestone_id=milestone_id,
-                    count=int(
-                        count[age]
-                    ),  # need a conversion to avoid numpy.int32 being stored as byte object
-                    avg_score=avg_scores[age],
-                    stddev_score=stddev_scores[age],
-                    expected_score=4 if age >= expected_age else 1,
-                )
-                for age in range(0, len(avg_scores))
-            ],
-        )
+            for age in range(0, len(avg_scores))
+        ],
+    )
 
 
 def calculate_milestonegroup_statistics_by_age(
-    session: SessionDep,
-    milestonegroup_id: int,
+    milestone_group_id: int,
+    answers: Sequence[MilestoneAnswer],
+    child_ages: dict[int, int],
+    existing_statistics: MilestoneGroupAgeScoreCollection | None,
 ) -> MilestoneGroupAgeScoreCollection | None:
     """
-    Calculate the mean, variance of a milestonegroup per age in months.
-    Takes into account only answers from expired sessions. If no statistics exist yet, all answers from expired sessions are considered, else only the ones newer than the last statistics are considered.
+    Calculate the mean, variance of a milestone group per age in months.
+    If existing statistics are provided they are updated using the provided answers.
 
     Parameters
     ----------
-    session : SessionDep
-        database session
-    milestonegroup_id : int
-        id of the milestonegroup to calculate the statistics for
-
+    milestone_group_id : int
+        id of the milestone group to calculate the statistics for
+    answers: Sequence[MilestoneAnswer]
+        the new answers to include in the statistics.
+    child_ages : dict[int, int]
+        dict of answer_session_id -> child age in months
+    existing_statistics: MilestoneGroupAgeScoreCollection | None
+        the existing statistics to update, if any
     Returns
     -------
     MilestoneGroupAgeScoreCollection | None
-        MilestoneGroupAgeScoreCollection object which contains a list of MilestoneGroupAgeScore objects,
-    one for each month, or None if there are no answers for the milestonegroup and no previous statistics.
+        updated statistics, or None if there are no new answers and no existing statistics.
     """
+    if len(answers) == 0:
+        # return existing statistics if no new answers are available
+        return existing_statistics
 
-    session_expired_days: int = 7
-
-    # get the newest statistics for the milestonegroup
-    last_statistics = session.get(MilestoneGroupAgeScoreCollection, milestonegroup_id)
-
-    child_ages = _get_answer_session_child_ages_in_months(session)
-    expiration_date = datetime.datetime.now() - datetime.timedelta(
-        days=session_expired_days
+    # initialize avg and stddev scores with the existing statistics or to None if no statistics are available
+    count, avg_scores, stddev_scores = _extract_stats(existing_statistics)
+    count, avg_scores, stddev_scores = _get_statistics_by_age(
+        answers, child_ages, count=count, avg=avg_scores, stddev=stddev_scores
     )
 
-    count = None
-    avg_scores = None
-    stddev_scores = None
-    # we have 2 kinds of querys that need to be executed depending on the existence of a statistics object
-    if last_statistics is None:
-        # no statistics exists yet -> all answers from expired sessions are relevant
-        answer_query = (
-            select(MilestoneAnswer)
-            .join(
-                MilestoneAnswerSession,
-                col(MilestoneAnswer.answer_session_id) == MilestoneAnswerSession.id,
+    return MilestoneGroupAgeScoreCollection(
+        milestone_group_id=milestone_group_id,
+        scores=[
+            MilestoneGroupAgeScore(
+                milestone_group_id=milestone_group_id,
+                age=age,
+                count=int(
+                    count[age]
+                ),  # need a conversion to avoid numpy.int32 being stored as byte object
+                avg_score=avg_scores[age],
+                stddev_score=stddev_scores[age],
             )
-            .where(MilestoneAnswer.milestone_group_id == milestonegroup_id)
-            .where(~col(MilestoneAnswer.included_in_milestonegroup_statistics))
-            .where(
-                MilestoneAnswerSession.created_at
-                <= expiration_date  # expired session only
-            )
-        )
-    else:
-        # initialize avg and stddev scores with the last known statistics
-        count = np.array(
-            [score.count for score in last_statistics.scores], dtype=np.int32
-        )
-        avg_scores = np.array(
-            [score.avg_score for score in last_statistics.scores], dtype=np.float64
-        )
-        stddev_scores = np.array(
-            [score.stddev_score for score in last_statistics.scores]
-        )
-        # we calculate the statistics with an online algorithm, so we only consider new data
-        # that has not been included in the last statistics but which stems from sessions that are expired
-        # README: same reason for type: ignore as in the function above
-        answer_query = (
-            select(MilestoneAnswer)
-            .join(
-                MilestoneAnswerSession,
-                col(MilestoneAnswer.answer_session_id) == MilestoneAnswerSession.id,
-            )
-            .where(MilestoneAnswer.milestone_group_id == milestonegroup_id)
-            .where(~col(MilestoneAnswer.included_in_milestonegroup_statistics))
-            .where(MilestoneAnswerSession.created_at <= expiration_date)
-        )
-
-    answers = session.exec(answer_query).all()
-
-    if len(answers) == 0:
-        # return last statistics if no new answers are available, because that is the best we can do then.
-
-        return last_statistics
-    else:
-        count, avg, stddev = _get_statistics_by_age(
-            answers, child_ages, count=count, avg=avg_scores, stddev=stddev_scores
-        )
-
-        # update answer.included_in_milestonegroup_statistics to True
-        for answer in answers:
-            answer.included_in_milestonegroup_statistics = True
-            session.merge(answer)
-        session.commit()
-
-        return MilestoneGroupAgeScoreCollection(
-            milestone_group_id=milestonegroup_id,
-            scores=[
-                MilestoneGroupAgeScore(
-                    milestone_group_id=milestonegroup_id,
-                    age=age,
-                    count=int(
-                        count[age]
-                    ),  # need a conversion to avoid numpy.int32 being stored as byte object
-                    avg_score=avg[age],
-                    stddev_score=stddev[age],
-                )
-                for age in range(0, len(avg))
-            ],
-            created_at=datetime.datetime.now(),
-        )
+            for age in range(0, len(avg_scores))
+        ],
+        created_at=datetime.datetime.now(),
+    )
diff --git a/mondey_backend/src/mondey_backend/routers/users.py b/mondey_backend/src/mondey_backend/routers/users.py
index 4b3e7a45..c2637bd9 100644
--- a/mondey_backend/src/mondey_backend/routers/users.py
+++ b/mondey_backend/src/mondey_backend/routers/users.py
@@ -149,6 +149,8 @@ def update_milestone_answer(
         )
         if milestone_answer_session.user_id != current_active_user.id:
             raise HTTPException(401)
+        if milestone_answer_session.expired:
+            raise HTTPException(401, "Answer session has expired")
         milestone_answer = milestone_answer_session.answers.get(answer.milestone_id)
         if milestone_answer is None:
             raise HTTPException(401)
diff --git a/mondey_backend/src/mondey_backend/routers/utils.py b/mondey_backend/src/mondey_backend/routers/utils.py
index 7605a424..ccf9f3ff 100644
--- a/mondey_backend/src/mondey_backend/routers/utils.py
+++ b/mondey_backend/src/mondey_backend/routers/utils.py
@@ -4,6 +4,7 @@
 import logging
 import pathlib
 from collections.abc import Iterable
+from collections.abc import Sequence
 from typing import TypeVar
 
 import numpy as np
@@ -144,9 +145,12 @@ def get_or_create_current_milestone_answer_session(
         .where(col(MilestoneAnswerSession.child_id) == child.id)
         .order_by(col(MilestoneAnswerSession.created_at).desc())
     ).first()
-    if milestone_answer_session is None or _session_has_expired(
-        milestone_answer_session
-    ):
+    if milestone_answer_session and _session_has_expired(milestone_answer_session):
+        milestone_answer_session.expired = True
+        session.add(milestone_answer_session)
+        session.commit()
+        session.refresh(milestone_answer_session)
+    if milestone_answer_session is None or milestone_answer_session.expired:
         milestone_answer_session = MilestoneAnswerSession(
             child_id=child.id,
             user_id=current_active_user.id,
@@ -195,9 +199,9 @@ def get_db_child(
     return child
 
 
-def _get_answer_session_child_ages_in_months(session: SessionDep) -> dict[int, int]:
-    answer_sessions = session.exec(select(MilestoneAnswerSession)).all()
-
+def _get_answer_session_child_ages_in_months(
+    session: SessionDep, answer_sessions: Sequence[MilestoneAnswerSession]
+) -> dict[int, int]:
     return {
         answer_session.id: get_child_age_in_months(  # type: ignore
             get(session, Child, answer_session.child_id), answer_session.created_at
@@ -208,7 +212,7 @@ def _get_answer_session_child_ages_in_months(session: SessionDep) -> dict[int, i
 
 def _get_expected_age_from_scores(scores: np.ndarray) -> int:
     # placeholder algorithm: returns first age with avg score > 3
-    return np.argmax(scores >= 3.0)
+    return int(np.argmax(scores >= 3.0))
 
 
 def child_image_path(child_id: int | None) -> pathlib.Path:
@@ -242,12 +246,12 @@ def get_milestonegroups_for_answersession(
 ) -> dict[int, MilestoneGroup]:
     check_for_overlap = (
         select(Milestone.group_id)
-        .where(Milestone.id.in_(answersession.answers.keys()))  # type: ignore
+        .where(col(Milestone.id).in_(answersession.answers.keys()))
         .distinct()
     )
     return {
         m.id: m  # type: ignore
         for m in session.exec(
-            select(MilestoneGroup).where(MilestoneGroup.id.in_(check_for_overlap))  # type: ignore
+            select(MilestoneGroup).where(col(MilestoneGroup.id).in_(check_for_overlap))
         ).all()
     }
diff --git a/mondey_backend/tests/conftest.py b/mondey_backend/tests/conftest.py
index f9cc3a83..c8395ed8 100644
--- a/mondey_backend/tests/conftest.py
+++ b/mondey_backend/tests/conftest.py
@@ -245,6 +245,8 @@ def session(children: list[dict], monkeypatch: pytest.MonkeyPatch):
                 child_id=1,
                 user_id=3,
                 created_at=datetime.datetime(last_month.year, last_month.month, 15),
+                expired=True,
+                included_in_statistics=True,
             )
         )
         session.add(
@@ -253,8 +255,6 @@ def session(children: list[dict], monkeypatch: pytest.MonkeyPatch):
                 milestone_id=1,
                 milestone_group_id=1,
                 answer=1,
-                included_in_milestone_statistics=True,
-                included_in_milestonegroup_statistics=True,
             )
         )
         session.add(
@@ -263,12 +263,16 @@ def session(children: list[dict], monkeypatch: pytest.MonkeyPatch):
                 milestone_id=2,
                 milestone_group_id=1,
                 answer=0,
-                included_in_milestone_statistics=True,
-                included_in_milestonegroup_statistics=True,
             )
         )
-        # add another (current) milestone answer session for child 1 / user (id 3) with 2 answers to the same questions
-        session.add(MilestoneAnswerSession(child_id=1, user_id=3, created_at=today))
+        # add another (unexpired and not included in stats) milestone answer session for child 1 / user (id 3) with 2 answers to the same questions
+        session.add(
+            MilestoneAnswerSession(
+                child_id=1,
+                user_id=3,
+                created_at=datetime.datetime(last_month.year, last_month.month, 20),
+            )
+        )
         # add two milestone answers
         session.add(
             MilestoneAnswer(
@@ -280,22 +284,93 @@ def session(children: list[dict], monkeypatch: pytest.MonkeyPatch):
                 answer_session_id=2, milestone_id=2, milestone_group_id=1, answer=2
             )
         )
-        # add an (expired) milestone answer session for child 3 / admin user (id 1) with 1 answer
+        # add an (un-expired) milestone answer session for child 3 / admin user (id 1) with 1 answer
         session.add(
             MilestoneAnswerSession(
                 child_id=3,
                 user_id=1,
-                created_at=datetime.datetime(today.year - 1, 1, 1),
+                created_at=datetime.datetime.today(),
+                expired=False,
+                included_in_statistics=False,
             )
         )
         session.add(
             MilestoneAnswer(
                 answer_session_id=3,
-                milestone_id=7,
+                milestone_id=5,
                 milestone_group_id=2,
                 answer=2,
             )
         )
+        # add MilestoneAgeScoreCollection for milestone 1
+        session.add(
+            MilestoneAgeScoreCollection(
+                milestone_id=1,
+                expected_age=8,
+                created_at=datetime.datetime(
+                    last_month.year,
+                    last_month.month,
+                    17,
+                ),
+            )
+        )
+        for age in range(0, 73):
+            session.add(
+                MilestoneAgeScore(
+                    age=age,
+                    milestone_id=1,
+                    count=1 if age == 8 else 0,
+                    avg_score=2.0 if age == 8 else 0,
+                    stddev_score=0.0,
+                    expected_score=3 if age >= 8 else 1,
+                )
+            )
+
+        # add MilestoneAgeScoreCollection for milestone 2
+        session.add(
+            MilestoneAgeScoreCollection(
+                milestone_id=2,
+                expected_age=8,
+                created_at=datetime.datetime(
+                    last_month.year,
+                    last_month.month,
+                    17,
+                ),
+            )
+        )
+        for age in range(0, 73):
+            session.add(
+                MilestoneAgeScore(
+                    age=age,
+                    milestone_id=2,
+                    count=1 if age == 8 else 0,
+                    avg_score=1.0 if age == 8 else 0,
+                    stddev_score=0.0,
+                    expected_score=3 if age >= 8 else 1,
+                )
+            )
+
+        # add MilestoneGroupAgeScoreCollection for milestone group 1
+        session.add(
+            MilestoneGroupAgeScoreCollection(
+                milestone_group_id=1,
+                created_at=datetime.datetime(
+                    last_month.year,
+                    last_month.month,
+                    17,
+                ),
+            )
+        )
+        for age in range(0, 73):
+            session.add(
+                MilestoneGroupAgeScore(
+                    age=age,
+                    milestone_group_id=1,
+                    count=2 if age == 8 else 0,
+                    avg_score=1.5 if age == 8 else 0.0,
+                    stddev_score=0.5 if age == 8 else 0.0,
+                )
+            )
         # add a research group (that user with id 3 is part of, and researcher with id 2 has access to)
         session.add(ResearchGroup(id="123451"))
         # add user questions for admin
@@ -484,6 +559,8 @@ def statistics_session(session):
             child_id=1,
             user_id=3,
             created_at=datetime.datetime(last_month.year, last_month.month, 20),
+            expired=True,
+            included_in_statistics=False,
         )
     )
     session.add(
@@ -497,114 +574,21 @@ def statistics_session(session):
         )
     )
 
-    # add another expired answersession for milestone 7 for child 3 that is a bit later
-    # than answersession 3 (the last one for the same child), but still expired
+    # add an expired answersession for child 3 with an answer for milestone 5
     session.add(
         MilestoneAnswerSession(
-            child_id=3, user_id=1, created_at=datetime.datetime(today.year - 1, 1, 10)
+            child_id=3,
+            user_id=1,
+            created_at=datetime.datetime(today.year - 1, 1, 10),
+            expired=True,
+            included_in_statistics=False,
         )
     )
     session.add(
         MilestoneAnswer(
-            answer_session_id=5, milestone_id=7, milestone_group_id=2, answer=1
-        )
-    )
-
-    # add MilestoneAgeScoreCollections for milestone 1 and 2. Done such that
-    # answersession 4 added above did not yet factor into its calculation
-    # numbers for avg/stddev in the scores will be arbitrary
-    session.add(
-        MilestoneAgeScoreCollection(
-            milestone_id=1,
-            expected_age=8,
-            created_at=datetime.datetime(
-                last_month.year,
-                last_month.month,
-                17,  # between answersessions -> recompute
-            ),
-        )
-    )
-
-    session.add(
-        MilestoneAgeScoreCollection(
-            milestone_id=2,
-            expected_age=8,
-            created_at=datetime.datetime(
-                last_month.year,
-                last_month.month,
-                17,  # between answersessions -> recompute
-            ),
-        )
-    )
-
-    def sigma(age, lower, upper, value):
-        if age < lower or age >= upper:
-            return 0
-        else:
-            return value
-
-    # add scores for milestone 1 and 2
-    for age in range(0, 73):
-        session.add(
-            MilestoneAgeScore(
-                age=age,
-                milestone_id=1,
-                count=12,
-                avg_score=0.0
-                if age < 5
-                else min(
-                    1 * age - 5, 3
-                ),  # linear increase from some age onward arbitrary numbers here
-                stddev_score=sigma(
-                    age, 5, 8, 0.35
-                ),  # arbitrary numbers here. constant stddev for increasing avg else 0
-                expected_score=3 if age >= 8 else 1,
-            )
-        )
-        session.add(
-            MilestoneAgeScore(
-                age=age,
-                milestone_id=2,
-                count=7,
-                avg_score=0.0 if age < 5 else min(0.5 * age - 2, 3),
-                stddev_score=sigma(age, 5, 10, 0.4),
-                expected_score=3 if age >= 10 else 1,
-            )
-        )
-
-    # add milestonegroup age score collection for milestonegroup 1
-    # which is a month old and hence is. repeats the logic used for the
-    # MilestoneAgeScores
-    session.add(
-        MilestoneGroupAgeScoreCollection(
-            milestone_group_id=1,
-            created_at=datetime.datetime(
-                last_month.year,
-                last_month.month,
-                17,  # between answersessions -> recompute
-            ),
+            answer_session_id=5, milestone_id=5, milestone_group_id=2, answer=1
         )
     )
-
-    for age in range(0, 73):
-        session.add(
-            MilestoneGroupAgeScore(
-                age=age,
-                milestone_group_id=1,
-                count=4
-                if age
-                in [
-                    5,
-                    6,
-                    7,
-                    8,
-                ]
-                else 0,
-                avg_score=0.0 if age < 5 else min(0.24 * age, 3),
-                stddev_score=sigma(age, 5, 9, 0.21),
-            )
-        )
-
     session.commit()
     yield session
 
diff --git a/mondey_backend/tests/routers/admin_routers/test_admin_milestones.py b/mondey_backend/tests/routers/admin_routers/test_admin_milestones.py
index 93907559..4fbcdd10 100644
--- a/mondey_backend/tests/routers/admin_routers/test_admin_milestones.py
+++ b/mondey_backend/tests/routers/admin_routers/test_admin_milestones.py
@@ -230,21 +230,17 @@ def test_get_milestone_age_scores(admin_client_stat: TestClient):
 
     assert response.json()["expected_age"] == 8
 
-    assert response.json()["scores"][7]["avg_score"] == pytest.approx(2.0)
-    assert response.json()["scores"][7]["stddev_score"] == pytest.approx(0.35)
-    assert response.json()["scores"][7]["count"] == 12
+    assert response.json()["scores"][7]["avg_score"] == pytest.approx(0.0)
+    assert response.json()["scores"][7]["stddev_score"] == pytest.approx(0.0)
+    assert response.json()["scores"][7]["count"] == 0
 
-    assert response.json()["scores"][8]["avg_score"] == pytest.approx(3.0)
+    assert response.json()["scores"][8]["avg_score"] == pytest.approx(2.0)
     assert response.json()["scores"][8]["stddev_score"] == pytest.approx(0.0)
-    assert response.json()["scores"][8]["count"] == 12
+    assert response.json()["scores"][8]["count"] == 1
 
-    assert response.json()["scores"][9]["avg_score"] == pytest.approx(3.0)
+    assert response.json()["scores"][9]["avg_score"] == pytest.approx(0.0)
     assert response.json()["scores"][9]["stddev_score"] == pytest.approx(0.0)
-    assert response.json()["scores"][9]["count"] == 12
-
-    assert response.json()["scores"][10]["avg_score"] == pytest.approx(3.0)
-    assert response.json()["scores"][10]["stddev_score"] == pytest.approx(0.0)
-    assert response.json()["scores"][10]["count"] == 12
+    assert response.json()["scores"][9]["count"] == 0
 
 
 def test_get_submitted_milestone_images(admin_client: TestClient):
diff --git a/mondey_backend/tests/routers/test_auth.py b/mondey_backend/tests/routers/test_auth.py
index 4a0b037f..2e0aafd2 100644
--- a/mondey_backend/tests/routers/test_auth.py
+++ b/mondey_backend/tests/routers/test_auth.py
@@ -127,8 +127,5 @@ def test_user_forgot_password_invalid_email(
     assert smtp_mock.last_message is None
     email = "invalid-email"
     response = user_client.post("/auth/forgot-password", json={"email": email})
-    assert (
-        response.json()["detail"][0]["msg"]
-        == "value is not a valid email address: An email address must have an @-sign."
-    )
+    assert "@" in response.json()["detail"][0]["msg"]
     assert response.json()["detail"][0]["type"] == "value_error"
diff --git a/mondey_backend/tests/routers/test_milestones.py b/mondey_backend/tests/routers/test_milestones.py
index a8d7d737..dfb901c4 100644
--- a/mondey_backend/tests/routers/test_milestones.py
+++ b/mondey_backend/tests/routers/test_milestones.py
@@ -47,10 +47,8 @@ def test_get_milestone_groups_child3(
     response = admin_client.get("/milestone-groups/3")
     assert response.status_code == 200
     assert len(response.json()) == 2
-    # child 3 age is ~60 months old, so no milestones
     milestone_group1["milestones"] = []
-    # and first last milestone from group2 (24m):
-    milestone_group2["milestones"] = []
+    milestone_group2["milestones"] = milestone_group2["milestones"][1:]
 
     assert response.json() == [milestone_group2, milestone_group1]
 
diff --git a/mondey_backend/tests/routers/test_users.py b/mondey_backend/tests/routers/test_users.py
index 8011e88c..46af9f2b 100644
--- a/mondey_backend/tests/routers/test_users.py
+++ b/mondey_backend/tests/routers/test_users.py
@@ -2,9 +2,6 @@
 import pathlib
 
 from fastapi.testclient import TestClient
-from sqlmodel import select
-
-from mondey_backend.models.milestones import MilestoneAnswer
 
 
 def _is_approx_now(iso_date_string: str, delta=datetime.timedelta(hours=1)) -> bool:
@@ -167,33 +164,40 @@ def test_get_milestone_answers_child8_child_does_not_exist(admin_client: TestCli
     assert response.status_code == 404
 
 
-def test_get_milestone_answers_child3_no_current_answer_session(
+def test_get_milestone_answers_child3_current_answer_session(
     admin_client: TestClient,
 ):
     response = admin_client.get("/users/milestone-answers/3")
     assert response.status_code == 200
-    assert response.json()["id"] == 4
+    assert response.json()["id"] == 3
     assert response.json()["child_id"] == 3
+    assert response.json()["answers"] == {
+        "5": {
+            "milestone_id": 5,
+            "answer": 2,
+        },
+    }
     assert _is_approx_now(response.json()["created_at"])
-    assert response.json()["answers"] == {}
 
 
-def test_get_milestone_answers_child1_current_answer_session(user_client: TestClient):
+def test_get_milestone_answers_child1_no_current_answer_session(
+    user_client: TestClient,
+):
     response = user_client.get("/users/milestone-answers/1")
     assert response.status_code == 200
-    assert response.json()["id"] == 2
+    assert response.json()["id"] == 4
     assert response.json()["child_id"] == 1
+    assert _is_approx_now(response.json()["created_at"])
     assert response.json()["answers"] == {
         "1": {
             "milestone_id": 1,
-            "answer": 3,
+            "answer": -1,
         },
         "2": {
             "milestone_id": 2,
-            "answer": 2,
+            "answer": -1,
         },
     }
-    assert _is_approx_now(response.json()["created_at"])
 
 
 def test_update_milestone_answer_no_current_answer_session(
@@ -221,7 +225,7 @@ def test_update_milestone_answer_update_existing_answer(user_client: TestClient)
     current_answer_session = user_client.get("/users/milestone-answers/1").json()
     assert current_answer_session["answers"]["1"] == {
         "milestone_id": 1,
-        "answer": 3,
+        "answer": -1,
     }
     new_answer = {
         "milestone_id": 1,
@@ -375,15 +379,6 @@ def test_update_current_child_answers_no_prexisting(
 
 
 def test_get_summary_feedback_for_session(user_client: TestClient, session):
-    answers = session.exec(
-        select(MilestoneAnswer).where(MilestoneAnswer.answer_session_id == 1)
-    ).all()
-    for answer in answers:
-        answer.included_in_milestone_statistics = False
-        answer.included_in_milestonegroup_statistics = False
-        session.merge(answer)
-    session.commit()
-
     response = user_client.get("/users/feedback/answersession=1/summary")
     assert response.status_code == 200
     assert response.json() == {"1": 1}
@@ -395,14 +390,6 @@ def test_get_summary_feedback_for_session_invalid(user_client: TestClient):
 
 
 def test_get_detailed_feedback_for_session(user_client: TestClient, session):
-    answers = session.exec(
-        select(MilestoneAnswer).where(MilestoneAnswer.answer_session_id == 1)
-    ).all()
-    for answer in answers:
-        answer.included_in_milestone_statistics = False
-        answer.included_in_milestonegroup_statistics = False
-        session.merge(answer)
-    session.commit()
     response = user_client.get("/users/feedback/answersession=1/detailed")
     assert response.status_code == 200
     assert response.json() == {"1": {"1": 1, "2": 1}}
diff --git a/mondey_backend/tests/utils/test_scores.py b/mondey_backend/tests/utils/test_scores.py
index 5d6ca73f..97d59e40 100644
--- a/mondey_backend/tests/utils/test_scores.py
+++ b/mondey_backend/tests/utils/test_scores.py
@@ -1,18 +1,14 @@
-from datetime import datetime
-from datetime import timedelta
-
-import numpy as np
+import pytest
 from sqlmodel import select
 
 from mondey_backend.models.milestones import MilestoneAgeScore
-from mondey_backend.models.milestones import MilestoneAgeScoreCollection
 from mondey_backend.models.milestones import MilestoneAnswerSession
-from mondey_backend.models.milestones import MilestoneGroupAgeScore
 from mondey_backend.models.milestones import MilestoneGroupAgeScoreCollection
 from mondey_backend.routers.scores import TrafficLight
 from mondey_backend.routers.scores import compute_feedback_simple
 from mondey_backend.routers.scores import compute_milestonegroup_feedback_detailed
 from mondey_backend.routers.scores import compute_milestonegroup_feedback_summary
+from mondey_backend.routers.statistics import update_stats
 from mondey_backend.routers.utils import get_milestonegroups_for_answersession
 
 
@@ -24,8 +20,9 @@ def test_get_milestonegroups_for_answersession(session):
 
 
 def test_get_milestonegroups_for_answersession_no_data(session):
-    answersession = session.get(MilestoneAnswerSession, 3)
-    milestonegroups = get_milestonegroups_for_answersession(session, answersession)
+    milestonegroups = get_milestonegroups_for_answersession(
+        session, MilestoneAnswerSession()
+    )
     assert len(milestonegroups) == 0
 
 
@@ -46,124 +43,102 @@ def test_compute_feedback_simple():
     score = 3
     assert compute_feedback_simple(dummy_scores, score) == 1
 
-    dummy_scores = MilestoneGroupAgeScore(
-        milestonegroup_id=1,
-        age_months=8,
-        avg_score=3.0,
-        stddev_score=1.2,
-    )
-
 
 def test_compute_summary_milestonegroup_feedback_for_answersession_with_recompute(
     statistics_session,
 ):
-    old_entries = statistics_session.exec(
-        select(MilestoneGroupAgeScoreCollection)
+    child_age = 8
+    # existing statistics for milestonegroup 1 at age 8 months: [1,2] -> mean = 1.5 +/- 0.5
+    statistics = statistics_session.exec(
+        select(MilestoneGroupAgeScoreCollection).where(
+            MilestoneGroupAgeScoreCollection.milestone_group_id == 1
+        )
     ).all()
-    assert len(old_entries) == 1
-    for entry in old_entries:
-        assert entry.created_at < datetime.now() - timedelta(
-            hours=1
-        )  # can be at max 1 hour old
-
-    # there is an existing statistics for milestonegroup 1, which has milestones 1 and 2
-    # which gives mean = 1.92 and stddev = 0.21, and we have 2 additional answers for these m
-    # milestones with answers 3 and 2 for milestones 1 and 2 respectively. ==> statistics
-    # changes to mean = 2.446 +/- 0.89. The first call updates the statistics with the new
-    # values, the second does not.
+    assert len(statistics) == 1
+    assert statistics[0].scores[child_age].avg_score == pytest.approx(1.5, abs=0.001)
+    assert statistics[0].scores[child_age].stddev_score == pytest.approx(0.5, abs=0.001)
+    # answer session 1 scores: [1, 2] -> mean 1.5 -> green
     feedback = compute_milestonegroup_feedback_summary(
         statistics_session, child_id=1, answersession_id=1
     )
-
-    assert feedback[1] == TrafficLight.yellow.value
+    assert feedback[1] == TrafficLight.green.value
     assert len(feedback) == 1
-
-    # same as above, but for answers 4, 3  -> 3.5 ==> green
+    # answer session 2 scores: [3, 4] -> mean 3.5 -> green
     feedback = compute_milestonegroup_feedback_summary(
         statistics_session, child_id=1, answersession_id=2
     )
-    assert len(feedback) == 1
     assert feedback[1] == TrafficLight.green.value
-    new_entries = statistics_session.exec(
-        select(MilestoneGroupAgeScoreCollection)
-    ).all()
-    assert len(new_entries) == len(old_entries)
-    for old, new in zip(old_entries, new_entries, strict=True):
-        assert new.created_at >= datetime.now() - timedelta(
-            hours=1
-        )  # can be at max 1 hour old
-        assert old.milestone_group_id == new.milestone_group_id
+    assert len(feedback) == 1
+    for update_existing_statistics in [True, False]:
+        update_stats(statistics_session, incremental_update=update_existing_statistics)
+        # updated stats for milestonegroup 1 at age 8 months: [1,2,3,4,3,4] -> mean = 2.83333 +/- ~1.2
+        statistics = statistics_session.exec(
+            select(MilestoneGroupAgeScoreCollection).where(
+                MilestoneGroupAgeScoreCollection.milestone_group_id == 1
+            )
+        ).all()
+        assert len(statistics) == 1
+        assert statistics[0].scores[child_age].avg_score == pytest.approx(
+            2.83333, abs=0.001
+        )
+        assert statistics[0].scores[child_age].stddev_score == pytest.approx(
+            1.2, abs=0.1
+        )
+        # answer session 1 score 1.5 -> yellow
+        feedback = compute_milestonegroup_feedback_summary(
+            statistics_session, child_id=1, answersession_id=1
+        )
+        assert feedback[1] == TrafficLight.yellow.value
+        assert len(feedback) == 1
+        # answer session 2 score 3.5 remains green
+        feedback = compute_milestonegroup_feedback_summary(
+            statistics_session, child_id=1, answersession_id=2
+        )
+        assert feedback[1] == TrafficLight.green.value
+        assert len(feedback) == 1
 
 
 def test_compute_summary_milestonegroup_feedback_for_answersession_no_existing_stat(
     statistics_session,
 ):
-    # there is only 2 answer sfor milestonegroup 2 which only has milestone 7.
-    # these 2 are from 2 answersessions which are 10 days apart so fall into the
-    # same age group => the feedback has only one entry for milestonegroup 2
-    # and because the answers are 3 and 2 -> avg = 2.5 +/- 0.7071 -> green for answer = 3
     feedback = compute_milestonegroup_feedback_summary(
         statistics_session, child_id=3, answersession_id=3
     )
 
     assert len(feedback) == 1
-    assert feedback[2] == TrafficLight.green.value
-
-    #  check that the statistics have been updated
-    statistics = statistics_session.exec(
-        select(MilestoneGroupAgeScoreCollection).where(
-            MilestoneGroupAgeScoreCollection.milestone_group_id == 2
-        )
-    ).all()
-    assert len(statistics) == 1
-    assert statistics[0].created_at >= datetime.now() - timedelta(
-        minutes=1
-    )  # can be at max 1 min old
-
-    assert statistics[0].scores[42].count == 2
-    assert np.isclose(statistics[0].scores[42].avg_score, 2.5)
-    assert np.isclose(statistics[0].scores[42].stddev_score, 0.7071)
-
-    for i, score in enumerate(statistics[0].scores):
-        if i != 42:
-            assert np.isclose(score.avg_score, 0)
-            assert np.isclose(score.stddev_score, 0)
-            assert np.isclose(score.count, 0)
+    assert feedback[2] == TrafficLight.invalid.value
 
 
 def test_compute_detailed_milestonegroup_feedback_for_answersession_with_recompute(
     statistics_session,
 ):
-    old_entries = statistics_session.exec(select(MilestoneAgeScoreCollection)).all()
-    assert len(old_entries) == 2
-    for entry in old_entries:
-        assert entry.created_at < datetime.now() - timedelta(
-            hours=1
-        )  # can be at max 1 hour old
+    # initial stats only include answer session 1: all feedback green
+    feedback = compute_milestonegroup_feedback_detailed(
+        statistics_session, child_id=1, answersession_id=1
+    )
+    assert len(feedback) == 1
+    assert len(feedback[1]) == 2
+    assert feedback[1][1] == TrafficLight.green.value
+    assert feedback[1][2] == TrafficLight.green.value
 
+    # updated stats include more answer sessions
+    update_stats(statistics_session, incremental_update=True)
     feedback = compute_milestonegroup_feedback_detailed(
         statistics_session, child_id=1, answersession_id=1
     )
     assert len(feedback) == 1
     assert len(feedback[1]) == 2
-    assert feedback[1][1] == TrafficLight.green.red.value
-    assert feedback[1][2] == TrafficLight.green.red.value
-    new_entries = statistics_session.exec(select(MilestoneAgeScoreCollection)).all()
-    assert len(new_entries) == 2
-    for old, new in zip(old_entries, new_entries, strict=True):
-        assert new.created_at >= datetime.now() - timedelta(
-            hours=1
-        )  # can be at max 1 hour old
-        assert old.milestone_id == new.milestone_id
+    # milestone 1: score 2, mean = 3.33+/-1.2 -> yellow
+    assert feedback[1][1] == TrafficLight.yellow.value
+    # milestone 2: score 1, mean = 2.33+/-1.2 -> yellow
+    assert feedback[1][2] == TrafficLight.yellow.value
 
 
 def test_compute_detailed_milestonegroup_feedback_for_answersession_no_existing_stat(
     statistics_session,
 ):
-    # follows the same logic as the corresponding test for the milestonegroup summary feedback
     feedback = compute_milestonegroup_feedback_detailed(
         statistics_session, child_id=3, answersession_id=3
     )
-
     assert len(feedback) == 1
-    assert feedback[2][7] == TrafficLight.green.green.value
+    assert feedback[2][5] == TrafficLight.invalid.value
diff --git a/mondey_backend/tests/utils/test_statistics.py b/mondey_backend/tests/utils/test_statistics.py
index 890dafdf..b0392049 100644
--- a/mondey_backend/tests/utils/test_statistics.py
+++ b/mondey_backend/tests/utils/test_statistics.py
@@ -1,20 +1,14 @@
-import datetime
-
 import numpy as np
 import pytest
-from sqlmodel import col
 from sqlmodel import select
 
 from mondey_backend.models.milestones import MilestoneAgeScoreCollection
 from mondey_backend.models.milestones import MilestoneAnswer
-from mondey_backend.models.milestones import MilestoneAnswerSession
-from mondey_backend.models.milestones import MilestoneGroup
 from mondey_backend.models.milestones import MilestoneGroupAgeScoreCollection
 from mondey_backend.routers.statistics import _add_sample
 from mondey_backend.routers.statistics import _finalize_statistics
 from mondey_backend.routers.statistics import _get_statistics_by_age
-from mondey_backend.routers.statistics import calculate_milestone_statistics_by_age
-from mondey_backend.routers.statistics import calculate_milestonegroup_statistics_by_age
+from mondey_backend.routers.statistics import update_stats
 
 
 def test_online_statistics_computation():
@@ -204,149 +198,74 @@ def test_get_score_statistics_by_age_no_data(statistics_session):
 
 
 def test_calculate_milestone_statistics_by_age(statistics_session):
-    expiration_date = datetime.datetime.now() - datetime.timedelta(days=7)
-    answers_query = (
-        select(MilestoneAnswer)
-        .join(
-            MilestoneAnswerSession,
-            col(MilestoneAnswer.answer_session_id) == MilestoneAnswerSession.id,
-        )
-        .where(MilestoneAnswer.milestone_id == 1)
-        .where(~col(MilestoneAnswer.included_in_milestone_statistics))
-        .where(col(MilestoneAnswerSession.created_at) <= expiration_date)
-    )
-
-    # originally, the relevant answers have not been integrated into the statistics yet
-    all_answers = statistics_session.exec(answers_query).all()
-    for answer in all_answers:
-        assert answer.included_in_milestone_statistics is False
-
-    # calculate_milestone_statistics_by_age
-    mscore = calculate_milestone_statistics_by_age(statistics_session, 1)
-
-    # old statistics has avg[age=8] = 3.0 and stddev[age=8] = 0.35, and we
-    # get one more answer from answersession 4 with answer = 3
-    assert mscore.milestone_id == 1
-    assert mscore.scores[8].count == 13
-    assert np.isclose(mscore.scores[8].avg_score, 3.0769)
-    assert np.isclose(mscore.scores[8].stddev_score, 0.27735)
-
-    # we have nothing new for everything else
-    for age in range(0, len(mscore.scores)):
-        if age != 8:
-            assert mscore.scores[age].count == 12
-            avg = 0 if age < 5 else min(1 * age - 5, 3)
-            assert np.isclose(mscore.scores[age].avg_score, avg)
-            stddev = 0.0 if age < 5 or age >= 8 else 0.35
-            assert np.isclose(mscore.scores[age].stddev_score, stddev)
-
-        if age < 8:
-            assert mscore.scores[age].expected_score == 1
-        else:
-            assert mscore.scores[age].expected_score == 4
-
-    # all answers for milestone 1 are now included into the answersesssion
-    # if they come from expired milestonesessions
-
-    all_answers = statistics_session.exec(answers_query).all()
-    for answer in all_answers:
-        assert answer.included_in_milestone_statistics is True
-
-    # the new result is not written into the database, so in order to check
-    # that data is not taken into account twice, we need to check against the
-    # old result, not the new one.
-    old = statistics_session.get(MilestoneAgeScoreCollection, 1)
-
-    mscore2 = calculate_milestone_statistics_by_age(statistics_session, 1)
-    for s1, s2 in zip(mscore2.scores, old.scores, strict=True):
-        assert s1.age == s2.age
-        assert s1.count == s2.count
-        assert np.isclose(s1.avg_score, s2.avg_score)
-        assert np.isclose(s1.stddev_score, s2.stddev_score)
-        assert np.isclose(s1.expected_score, s2.expected_score)
+    m1 = statistics_session.get(MilestoneAgeScoreCollection, 1)
+    m2 = statistics_session.get(MilestoneAgeScoreCollection, 2)
+
+    # existing stats (only answer session 1)
+    assert m1.milestone_id == 1
+    assert m1.scores[8].count == 1
+    assert np.isclose(m1.scores[8].avg_score, 2.0)
+    assert np.isclose(m1.scores[8].stddev_score, 0.0)
+
+    assert m2.milestone_id == 2
+    assert m2.scores[8].count == 1
+    assert np.isclose(m2.scores[8].avg_score, 1.0)
+    assert np.isclose(m2.scores[8].stddev_score, 0.0)
+
+    # updated stats (answer sessions 1, 2, 4)
+    update_stats(statistics_session, incremental_update=False)
+    m1 = statistics_session.get(MilestoneAgeScoreCollection, 1)
+    m2 = statistics_session.get(MilestoneAgeScoreCollection, 2)
+
+    assert m1.milestone_id == 1
+    assert m1.scores[8].count == 3
+    assert np.isclose(m1.scores[8].avg_score, (2 + 4 + 4) / 3.0)
+    assert m1.scores[8].stddev_score == pytest.approx(1.15, abs=0.1)
+
+    assert m2.milestone_id == 2
+    assert m2.scores[8].count == 3
+    assert np.isclose(m2.scores[8].avg_score, (1 + 3 + 3) / 3.0)
+    assert m2.scores[8].stddev_score == pytest.approx(1.15, abs=0.1)
+
+    # re-calculating using all answers gives the same results
+    update_stats(statistics_session, incremental_update=False)
+    m1 = statistics_session.get(MilestoneAgeScoreCollection, 1)
+    m2 = statistics_session.get(MilestoneAgeScoreCollection, 2)
+
+    assert m1.milestone_id == 1
+    assert m1.scores[8].count == 3
+    assert np.isclose(m1.scores[8].avg_score, (2 + 4 + 4) / 3.0)
+    assert m1.scores[8].stddev_score == pytest.approx(1.15, abs=0.1)
+
+    assert m2.milestone_id == 2
+    assert m2.scores[8].count == 3
+    assert np.isclose(m2.scores[8].avg_score, (1 + 3 + 3) / 3.0)
+    assert m2.scores[8].stddev_score == pytest.approx(1.15, abs=0.1)
 
 
 def test_calculate_milestonegroup_statistics(statistics_session):
-    expiration_date = datetime.datetime.now() - datetime.timedelta(days=7)
-
-    answer_query = (
-        select(MilestoneAnswer)
-        .join(
-            MilestoneAnswerSession,
-            col(MilestoneAnswer.answer_session_id) == MilestoneAnswerSession.id,
-        )
-        .where(MilestoneAnswer.milestone_group_id == 1)
-        .where(~col(MilestoneAnswer.included_in_milestonegroup_statistics))
-        .where(MilestoneAnswerSession.created_at <= expiration_date)
-    )
-
-    all_answers = statistics_session.exec(answer_query).all()
-    for answer in all_answers:
-        print(answer)
-        assert answer.included_in_milestonegroup_statistics is False
-
-    milestone_group = statistics_session.exec(
-        select(MilestoneGroup).where(MilestoneGroup.id == 1)
-    ).first()
-
-    score = calculate_milestonegroup_statistics_by_age(
-        statistics_session,
-        milestone_group.id,
-    )
-
-    assert score.milestone_group_id == 1
-    # no change for these ages
-    assert np.isclose(score.scores[5].avg_score, 1.2)
-    assert np.isclose(score.scores[6].avg_score, 1.44)
-    assert np.isclose(score.scores[7].avg_score, 1.68)
-    assert np.isclose(score.scores[9].avg_score, 2.16)
-    assert np.isclose(score.scores[10].avg_score, 2.4)
-    assert np.isclose(score.scores[11].avg_score, 2.64)
-    assert np.isclose(score.scores[12].avg_score, 2.88)
-
-    for age in [
-        5,
-        6,
-        7,
-    ]:
-        assert np.isclose(score.scores[age].count, 4)  # no change for this age
-        assert np.isclose(
-            score.scores[age].stddev_score, 0.21
-        )  # no change for this age
-
-    assert score.scores[8].count == 6
-    assert np.isclose(
-        score.scores[8].avg_score, 2.446666
-    )  # new answers from answersession 4 -> changed value
-    assert np.isclose(
-        score.scores[8].stddev_score, 0.890037
-    )  # new answers from answersession 4 -> changed value
-    assert score.scores[8].age == 8
-    assert score.scores[8].milestone_group_id == 1
-    assert score.created_at - datetime.datetime.now() < datetime.timedelta(
-        minutes=1
-    )  # allow for very slow machine in CI
-
-    for age in range(0, len(score.scores)):
-        if age not in [5, 6, 7, 8]:
-            assert score.scores[age].count == 0
-        if age > 12:
-            assert np.isclose(score.scores[age].avg_score, 3.0)
-
-    # check that calling the statistics anew with already integrated answers doesn´t change anything.
-    # we need to check against the old result, not the new one because this is not written into the database
-    all_answers = statistics_session.exec(answer_query).all()
-    for answer in all_answers:
-        assert answer.included_in_milestonegroup_statistics is True
-
-    old_stats = statistics_session.get(MilestoneGroupAgeScoreCollection, 1)
-    new_stats = calculate_milestonegroup_statistics_by_age(
-        statistics_session,
-        milestone_group.id,
-    )
-    for new_score, old_score in zip(new_stats.scores, old_stats.scores, strict=True):
-        assert new_score.age == old_score.age
-        assert new_score.count == old_score.count
-        assert np.isclose(new_score.avg_score, old_score.avg_score)
-        assert np.isclose(new_score.stddev_score, old_score.stddev_score)
-        assert new_score.milestone_group_id == old_score.milestone_group_id
+    mg = statistics_session.get(MilestoneGroupAgeScoreCollection, 1)
+
+    # existing stats (only answer session 1)
+    assert mg.milestone_group_id == 1
+    assert mg.scores[8].count == 2
+    assert np.isclose(mg.scores[8].avg_score, (1 + 2) / 2.0)
+    assert np.isclose(mg.scores[8].stddev_score, 0.5)
+
+    # updated stats (answer sessions 1, 2, 4)
+    update_stats(statistics_session, incremental_update=True)
+    mg = statistics_session.get(MilestoneGroupAgeScoreCollection, 1)
+
+    assert mg.milestone_group_id == 1
+    assert mg.scores[8].count == 6
+    assert np.isclose(mg.scores[8].avg_score, (1 + 2 + 3 + 4 + 3 + 4) / 6.0)
+    assert mg.scores[8].stddev_score == pytest.approx(1.15, abs=0.1)
+
+    # re-calculating using all answers gives the same results
+    update_stats(statistics_session, incremental_update=False)
+    mg = statistics_session.get(MilestoneGroupAgeScoreCollection, 1)
+
+    assert mg.milestone_group_id == 1
+    assert mg.scores[8].count == 6
+    assert np.isclose(mg.scores[8].avg_score, (1 + 2 + 3 + 4 + 3 + 4) / 6.0)
+    assert mg.scores[8].stddev_score == pytest.approx(1.15, abs=0.1)
diff --git a/mondey_backend/tests/utils/test_utils.py b/mondey_backend/tests/utils/test_utils.py
index 488f696b..4d1f7866 100644
--- a/mondey_backend/tests/utils/test_utils.py
+++ b/mondey_backend/tests/utils/test_utils.py
@@ -22,9 +22,10 @@ def test_get_milestonegroups_for_answersession(session):
 
 
 def test_get_answer_session_child_ages_in_months(session):
-    child_ages = _get_answer_session_child_ages_in_months(session)
+    answer_sessions = session.exec(select(MilestoneAnswerSession)).all()
+    child_ages = _get_answer_session_child_ages_in_months(session, answer_sessions)
 
     assert len(child_ages) == 3
     assert child_ages[1] == 8
-    assert child_ages[2] == 9
-    assert child_ages[3] == 42
+    assert child_ages[2] == 8
+    assert child_ages[3] == 55