feat: implement grading strategies for peer evaluations #2196

Merged May 9, 2024 (30 commits; changes shown from 23 commits)

Commits (all by mariajgrimaldi):
85bf487  feat: implement mean score strategy (Sep 26, 2023)
eb4b911  refactor: re-generate statics for implementation (Mar 25, 2024)
9b4d8e2  fix: address test failures (Mar 25, 2024)
c3c73c2  feat: generate translations in english (Apr 1, 2024)
a4fe366  feat: generate translations in eo (Apr 1, 2024)
cde0f5d  feat: generate dummy translations (Apr 1, 2024)
09e7b9f  docs: update docstring accordingly (Apr 2, 2024)
d804bdb  temp: remove forcing default value for grading strategy (Apr 2, 2024)
fcc3c8b  refactor: add missing defaults for tests (Apr 2, 2024)
3936e2d  refactor: set peerAssessmentView grading strategy (Apr 2, 2024)
9351c35  refactor!: address PR reviews (Apr 2, 2024)
3a3f841  feat: re-generate translations (Apr 2, 2024)
0cd23e1  refactor: address PR review (Apr 4, 2024)
a31f654  Revert "refactor: add missing defaults for tests" (Apr 4, 2024)
ced373a  chore: update translations to latest (May 7, 2024)
a955bde  chore: generate latest statics (May 7, 2024)
99a8bc1  test: add test cases for message rendering in grade details (May 7, 2024)
4101928  docs: update docstrings with better descriptions (May 7, 2024)
f4da665  refactor: get peer_median_default instead of peer_median (May 7, 2024)
d323dc9  fix: add user_id to scenario decorator (May 7, 2024)
0d8975b  refactor: test only peer-steps instead of peer-self (May 7, 2024)
7c97379  fix: match must_grade/must_be_graded_by to test setup (May 7, 2024)
b1df638  fix: get peer_median_default instead of peer_median (May 7, 2024)
e507ab5  refactor: address PR reviews (May 8, 2024)
61f952c  refactor: use new method instead of median scores getter (May 8, 2024)
9ba4f80  fix: remove extra unnecessary argument (May 8, 2024)
4c6f474  fix: rewrite tests according latest feedback/changes (May 8, 2024)
97ff348  fix: use correct feature dicts (May 8, 2024)
da58dc8  refactor: address test failures (May 8, 2024)
69bc17a  docs: update version for upcoming release (May 9, 2024)

74 changes: 71 additions & 3 deletions openassessment/assessment/api/peer.py
@@ -8,6 +8,7 @@

import logging

from django.conf import settings
from django.db import DatabaseError, IntegrityError, transaction
from django.utils import timezone

@@ -27,6 +28,12 @@
FLEXIBLE_PEER_GRADING_GRADED_BY_PERCENTAGE = 30


class PeerGradingStrategy:
"""Grading strategies for peer assessments."""
MEAN = "mean"
MEDIAN = "median"


def flexible_peer_grading_enabled(peer_requirements, course_settings):
"""
Is flexible peer grading turned on? Either at the course override
@@ -51,6 +58,17 @@ def flexible_peer_grading_active(submission_uuid, peer_requirements, course_sett
return days_elapsed >= FLEXIBLE_PEER_GRADING_REQUIRED_SUBMISSION_AGE_IN_DAYS


def get_peer_grading_strategy(workflow_requirements):
"""
Get the peer grading strategy, either mean or median. Defaults to median.
"""
if "peer" not in workflow_requirements:
return workflow_requirements.get("grading_strategy", PeerGradingStrategy.MEDIAN)
return workflow_requirements.get("peer", {}).get(
"grading_strategy", PeerGradingStrategy.MEDIAN,
)


def required_peer_grades(submission_uuid, peer_requirements, course_settings):
"""
Given a submission id, finds how many peer assessment required.
@@ -281,10 +299,12 @@ def get_score(submission_uuid, peer_requirements, course_settings):
scored_item.save()
assessments = [item.assessment for item in items]

+scores_dict = get_assessment_scores_with_grading_strategy(
+    submission_uuid,
+    peer_requirements,
+)
return {
-"points_earned": sum(
-    get_assessment_median_scores(submission_uuid).values()
-),
+"points_earned": sum(scores_dict.values()),
"points_possible": assessments[0].points_possible,
"contributing_assessments": [assessment.id for assessment in assessments],
"staff_id": None,
@@ -501,6 +521,54 @@ def get_rubric_max_scores(submission_uuid):
raise PeerAssessmentInternalError(error_message) from ex


def get_assessment_scores_with_grading_strategy(submission_uuid, workflow_requirements):
"""Get the score for each rubric criterion calculated given grading strategy
obtained from the peer requirements dictionary. If no grading strategy is
provided in the peer requirements or the feature flag is not enabled, the
default median score calculation is used.

This function is based on get_assessment_median_scores, but allows the caller
to specify the grading strategy (mean, median) to use when calculating the score.

Args:
submission_uuid (str): The submission uuid is used to get the
assessments used to score this submission, and generate the
appropriate median/mean score.
workflow_requirements (dict): Dictionary with the key "grading_strategy"

Returns:
dict: A dictionary of rubric criterion names,
with a median/mean score of the peer assessments.

Raises:
PeerAssessmentInternalError: If any error occurs while retrieving
information to form the median/mean scores, an error is raised.
"""
# If the feature flag is not enabled, use the median score calculation
# as the default behavior.
if not settings.FEATURES.get("ENABLE_ORA_PEER_CONFIGURABLE_GRADING", False):
return get_assessment_median_scores(submission_uuid)

current_grading_strategy = get_peer_grading_strategy(workflow_requirements)
try:
workflow = PeerWorkflow.objects.get(submission_uuid=submission_uuid)
items = workflow.graded_by.filter(scored=True)
assessments = [item.assessment for item in items]
scores = Assessment.scores_by_criterion(assessments)
return Assessment.get_score_dict(
scores,
grading_strategy=current_grading_strategy,
)
except PeerWorkflow.DoesNotExist:
return {}
except DatabaseError as ex:
error_message = (
"Error getting assessment median scores for submission {uuid}"
).format(uuid=submission_uuid)
logger.exception(error_message)
raise PeerAssessmentInternalError(error_message) from ex


def get_assessment_median_scores(submission_uuid):
"""Get the median score for each rubric criterion

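To make the selection rules above easier to scan, here is a small illustrative sketch (not part of the PR) that mirrors the lookup in `get_peer_grading_strategy`; the requirement-dict shapes are taken from the tests later in the diff:

```python
# Mirrors get_peer_grading_strategy from the diff above, for illustration only.
MEDIAN, MEAN = "median", "mean"


def peer_grading_strategy(workflow_requirements):
    # Requirements may be the full workflow dict (with a "peer" step) or the
    # peer step's own requirements; in both cases median is the fallback.
    if "peer" not in workflow_requirements:
        return workflow_requirements.get("grading_strategy", MEDIAN)
    return workflow_requirements.get("peer", {}).get("grading_strategy", MEDIAN)


# Strategy nested under the "peer" step (the shape passed to get_score).
assert peer_grading_strategy({"peer": {"grading_strategy": "mean"}}) == MEAN
# Flat peer-step requirements: the top-level key is used.
assert peer_grading_strategy({"grading_strategy": "mean"}) == MEAN
# Nothing configured: fall back to the historical median behaviour.
assert peer_grading_strategy({}) == MEDIAN
```

Note that `get_assessment_scores_with_grading_strategy` additionally checks the `ENABLE_ORA_PEER_CONFIGURABLE_GRADING` feature flag; with the flag absent or off, the median path is always used regardless of the configured strategy.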
68 changes: 68 additions & 0 deletions openassessment/assessment/models/base.py
@@ -488,6 +488,22 @@ def create(cls, rubric, scorer_id, submission_uuid, score_type, feedback=None, s

return cls.objects.create(**assessment_params)

@classmethod
def get_score_dict(cls, scores_dict, grading_strategy):
"""Determine the score in a dictionary of lists of scores based on the
grading strategy calculation configuration.

Args:
scores_dict (dict): A dictionary of lists of int values. These int values
are reduced to a single value per criterion using the given strategy.
grading_strategy (str): The grading strategy to use, either "mean" or "median".

Returns:
(dict): A dictionary with criterion name keys and mean or median score
values.
"""
return getattr(cls, f"get_{grading_strategy}_score_dict")(scores_dict)

@classmethod
def get_median_score_dict(cls, scores_dict):
"""Determine the median score in a dictionary of lists of scores
@@ -518,6 +534,36 @@ def get_median_score_dict(cls, scores_dict):
median_scores[criterion] = criterion_score
return median_scores

@classmethod
def get_mean_score_dict(cls, scores_dict):
"""Determine the mean score in a dictionary of lists of scores

For a dictionary of lists, where each list contains a set of scores,
determine the mean value in each list.

Args:
scores_dict (dict): A dictionary of lists of int values. These int
values are reduced to a single value that represents the mean.

Returns:
(dict): A dictionary with criterion name keys and mean score
values.

Examples:
>>> scores = {
>>> "foo": [5, 6, 12, 16, 22, 53],
>>> "bar": [5, 6, 12, 16, 22, 53, 102]
>>> }
>>> Assessment.get_mean_score_dict(scores)
{"foo": 19, "bar": 31}

"""
mean_scores = {}
for criterion, criterion_scores in scores_dict.items():
criterion_score = Assessment.get_mean_score(criterion_scores)
mean_scores[criterion] = criterion_score
return mean_scores

@staticmethod
def get_median_score(scores):
"""Determine the median score in a list of scores
@@ -552,6 +598,28 @@ def get_median_score(scores):
)
return median_score

@staticmethod
def get_mean_score(scores):
"""Calculate the mean score from a list of scores

Args:
scores (list): A list of int values. These int values
are reduced to a single value that represents the mean.

Returns:
(int): The mean score.

Examples:
>>> scores = [5, 6, 12, 16, 22, 53]
>>> Assessment.get_mean_score(scores)
19

"""
total_criterion_scores = len(scores)
if total_criterion_scores == 0:
return 0
return math.ceil(sum(scores) / float(total_criterion_scores))

pomegranited (Contributor) commented:

I got an unexpected mean score:

[screenshot: grade details showing the unexpected mean score]

(it's the second default rubric, so poor=0, fair=1, good=3, excellent=5):
1 + 0 + 3 + 1 + 5 = 10, 10/5 = 2

Is it because there's no 2 in the rubric, so it rounds up to the next option, 3?

mariajgrimaldi (Member, Author), May 8, 2024:

This threw me for a loop. So, I tried replicating the same scenario. Here are the criteria I used:
Ideas: poor=0, fair=3, good=5
Content: poor=0, fair=1, good=3, excellent=5
Graded-by-5 configuration
And got:
[screenshot of the resulting grade details]

While researching, I found that the assessments are retrieved, ordered, and then truncated by the number of peer grades needed. Could you confirm that the number of peer grades configured while testing was 5?

mariajgrimaldi (Member, Author):

I also reproduced the same scenario with the graded-by-3 configuration and got 2, since the first three peers graded 1, 0, and 3 in that order. Please let me know if the submission order also affected your tests.

pomegranited (Contributor), May 9, 2024:

Oh wow, submission order does affect the final score! That's annoying.

> I also added an extra test here with the exact values: https://github.com/openedx/edx-ora2/pull/2196/files#diff-7fe9b2681acaf897f2dd62a13553badf288fd6144832ce0562c4a58bd2640e7fR2381

Thanks for that -- I also ran this test with the numbers shuffled in a different order, to confirm that it's not the "mean score" logic that's causing this issue. It must be something done after that.

--- a/openassessment/assessment/test/test_peer.py
+++ b/openassessment/assessment/test/test_peer.py
@@ -2375,6 +2375,7 @@ class TestPeerApi(CacheResetTest):
         self.assertEqual(31, Assessment.get_mean_score([5, 6, 12, 16, 22, 53, 102]))
         self.assertEqual(31, Assessment.get_mean_score([16, 6, 12, 102, 22, 53, 5]))
         self.assertEqual(2, Assessment.get_mean_score([0, 1, 3, 1, 5]))
+        self.assertEqual(2, Assessment.get_mean_score([5, 3, 1, 1, 0]))

> Could you confirm that the number of peer grades configured while testing was 5?

Yes, 5 peer grades were required, and only 5 were received.

I used the default graded-by-5 configuration, with
Ideas: poor=0, fair=3, good=5
Content: poor=0, fair=1, good=3, excellent=5

Audit user received the same "Content" peer assessments as Honor, but in a different order, and so they got different scores. (Apologies about the "Ideas" assessments, I thought I did them all the same too, but apparently not.)

Audit received:
[screenshot of Audit's "Content" peer assessment scores]

Honor received:
[screenshot of Honor's "Content" peer assessment scores]


@classmethod
def scores_by_criterion(cls, assessments):
"""Create a dictionary of lists for scores associated with criterion
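As a follow-up to the review thread above, a small sketch (not from the PR) of the arithmetic involved: the ceiling-of-the-mean helper is order-insensitive over the full list, so the differing scores must come from which assessments get selected before the mean is taken. The truncation shown here is a hypothetical stand-in for the "retrieved, ordered, then truncated" behaviour described in the thread, not the PR's actual selection code.

```python
import math


def mean_score(scores):
    # Mirrors Assessment.get_mean_score from the diff: ceiling of the average.
    if not scores:
        return 0
    return math.ceil(sum(scores) / float(len(scores)))


# The five "Content" scores discussed in the thread, in two different orders.
in_one_order = [1, 0, 3, 1, 5]
in_another_order = [5, 3, 1, 1, 0]

# On the full set, order does not matter: ceil(10 / 5) == 2 either way.
assert mean_score(in_one_order) == 2
assert mean_score(in_another_order) == 2

# Hypothetical illustration of the selection effect: if only the first
# `graded_by` assessments were kept after ordering, different orderings
# would feed different subsets into the mean.
graded_by = 3
assert mean_score(in_one_order[:graded_by]) == 2      # ceil(4 / 3) == 2
assert mean_score(in_another_order[:graded_by]) == 3  # ceil(9 / 3) == 3
```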
139 changes: 137 additions & 2 deletions openassessment/assessment/test/test_peer.py
@@ -2,19 +2,21 @@

import copy
import datetime
-from unittest.mock import patch
+from unittest.mock import Mock, patch

from ddt import ddt, file_data, data, unpack
import pytz

from django.conf import settings
from django.db import DatabaseError, IntegrityError
from django.utils import timezone
from django.test.utils import override_settings
from freezegun import freeze_time
from pytest import raises

from submissions import api as sub_api
from openassessment.assessment.api import peer as peer_api
-from openassessment.assessment.errors.peer import PeerAssessmentWorkflowError
+from openassessment.assessment.errors.peer import PeerAssessmentInternalError, PeerAssessmentWorkflowError
from openassessment.assessment.models import (
Assessment,
AssessmentFeedback,
Expand Down Expand Up @@ -167,6 +169,11 @@
'force_on_flexible_peer_openassessments': True
}

FEATURES_WITH_GRADING_STRATEGY_ON = settings.FEATURES.copy()
FEATURES_WITH_GRADING_STRATEGY_OFF = settings.FEATURES.copy()
FEATURES_WITH_GRADING_STRATEGY_ON['ENABLE_ORA_PEER_CONFIGURABLE_GRADING'] = True
FEATURES_WITH_GRADING_STRATEGY_OFF['ENABLE_ORA_PEER_CONFIGURABLE_GRADING'] = False


@ddt
class TestPeerApi(CacheResetTest):
@@ -2230,6 +2237,134 @@ def test_flexible_peer_grading_waiting_on_submitter(self):
self.assertTrue(peer_api.flexible_peer_grading_active(alice_sub['uuid'], requirements, COURSE_SETTINGS))
self._assert_num_scored_items(alice_sub, 4)

@override_settings(FEATURES=FEATURES_WITH_GRADING_STRATEGY_ON)
@patch("openassessment.assessment.api.peer.PeerWorkflow")
@patch("openassessment.assessment.api.peer.Assessment.scores_by_criterion")
@data("mean", "median")
def test_get_peer_score_with_grading_strategy_enabled(
self,
grading_strategy,
scores_by_criterion_mock,
_
):
"""Test get score when grading strategy feature is on.

When grading strategy is enabled, the score is calculated using the
mean or median calculation method based on the grading strategy
configured for the peers step requirement.
"""
workflow_requirements = {
"peer": {
"must_grade": 5,
"must_be_graded_by": 4,
"enable_flexible_grading": False,
"grading_strategy": grading_strategy,
},
}

submission_uuid = "test_submission_uuid"
scores_by_criterion = {
"criterion_1": [6, 8, 15, 29, 37],
"criterion_2": [0, 1, 2, 4, 9]
}
scores_by_criterion_mock.return_value = scores_by_criterion

score_dict = peer_api.get_assessment_scores_with_grading_strategy(submission_uuid, workflow_requirements)

if grading_strategy == "mean":
self.assertDictEqual(score_dict, {"criterion_1": 19, "criterion_2": 4})

if grading_strategy == "median":
self.assertDictEqual(score_dict, {"criterion_1": 15, "criterion_2": 2})

@override_settings(FEATURES=FEATURES_WITH_GRADING_STRATEGY_ON)
@patch("openassessment.assessment.api.peer.PeerWorkflow.objects.get")
@patch("openassessment.assessment.api.peer.Assessment.scores_by_criterion")
@data("mean", "median")
def test_get_peer_score_with_grading_strategy_raise_errors(
self,
grading_strategy,
scores_by_criterion_mock,
peer_workflow_mock
):
"""Test get score when grading strategy feature is on and the
application flow raises errors.

When grading strategy is enabled, the score is calculated using the
mean or median calculation method based on the grading strategy
configured for the peers step requirement.
"""
workflow_requirements = {
"peer": {
"must_grade": 5,
"must_be_graded_by": 4,
"enable_flexible_grading": False,
"grading_strategy": grading_strategy,
},
}

submission_uuid = "test_submission_uuid"
scores_by_criterion = {
"criterion_1": [6, 8, 15, 29, 37],
"criterion_2": [0, 1, 2, 4, 9]
}
scores_by_criterion_mock.return_value = scores_by_criterion
peer_workflow_mock.side_effect = PeerWorkflow.DoesNotExist

self.assertDictEqual(
{},
peer_api.get_assessment_scores_with_grading_strategy(
submission_uuid,
workflow_requirements
)
)

workflow = Mock()
peer_workflow_mock.side_effect = workflow
workflow.return_value.graded_by.filter.return_value = []
scores_by_criterion_mock.side_effect = DatabaseError

with self.assertRaises(PeerAssessmentInternalError):
peer_api.get_assessment_scores_with_grading_strategy(
submission_uuid,
workflow_requirements
)

@override_settings(FEATURES=FEATURES_WITH_GRADING_STRATEGY_OFF)
@patch("openassessment.assessment.api.peer.Assessment")
@patch("openassessment.assessment.api.peer.get_assessment_median_scores")
def test_get_peer_score_with_grading_strategy_disabled(
self,
get_median_scores_mock,
assessment_mock
):
"""Test get score when grading strategy feature is off.

When grading strategy is disabled, the score is calculated using the
median of the peer assessments calculation method.
"""
workflow_requirements = {
"must_grade": 5,
"must_be_graded_by": 4,
"enable_flexible_grading": False,
"grading_strategy": "mean",
}
submission_uuid = "test_submission_uuid"
peer_api.get_assessment_scores_with_grading_strategy(submission_uuid, workflow_requirements)

get_median_scores_mock.assert_called_once_with(submission_uuid)
assessment_mock.get_score_dict.assert_not_called()

def test_mean_score_calculation(self):
"""Test mean score calculation for peer assessments."""
self.assertEqual(0, Assessment.get_mean_score([]))
self.assertEqual(5, Assessment.get_mean_score([5]))
self.assertEqual(6, Assessment.get_mean_score([5, 6]))
self.assertEqual(19, Assessment.get_mean_score([5, 6, 12, 16, 22, 53]))
self.assertEqual(19, Assessment.get_mean_score([6, 5, 12, 53, 16, 22]))
self.assertEqual(31, Assessment.get_mean_score([5, 6, 12, 16, 22, 53, 102]))
self.assertEqual(31, Assessment.get_mean_score([16, 6, 12, 102, 22, 53, 5]))


class PeerWorkflowTest(CacheResetTest):
"""
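Finally, as the tests above indicate, the new behaviour is gated behind a Django feature flag. A minimal sketch of enabling it in a deployment's Django settings follows; the settings module location is an assumption for illustration, only the flag name comes from the PR.

```python
# In a Django settings module (location assumed; flag name taken from the PR).
FEATURES = {
    # Existing feature switches for the deployment would also live here.
    "ENABLE_ORA_PEER_CONFIGURABLE_GRADING": True,
}

# With the flag absent or set to False, get_assessment_scores_with_grading_strategy
# falls back to get_assessment_median_scores, regardless of any "grading_strategy"
# configured in the peer step requirements.
```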