initial changes in partial_update #1107

Open · wants to merge 9 commits into base: dev · Changes from all commits
27 changes: 18 additions & 9 deletions backend/functions/tasks.py
@@ -1541,15 +1541,24 @@ def update_meta_stats(
                ann_obj.result
            )
        elif project_type in get_audio_project_types():
-            result_meta_stats[ann_obj.annotation_status]["Raw Audio Duration"] += task_data[
-                "audio_duration"
-            ]
-            result_meta_stats[ann_obj.annotation_status][
-                "Segment Duration"
-            ] += get_audio_transcription_duration(ann_obj.result)
-            result_meta_stats[ann_obj.annotation_status][
-                "Not Null Segment Duration"
-            ] += get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id)
+            meta_stats = ann_obj.meta_stats
+            if meta_stats:
+                result_meta_stats[ann_obj.annotation_status][
+                    "Segment Duration"
+                ] = meta_stats["total_segment_duration"]
+                result_meta_stats[ann_obj.annotation_status][
+                    "Not Null Segment Duration"
+                ] = meta_stats["not_null_segment_duration"]
+            else:
+                result_meta_stats[ann_obj.annotation_status][
+                    "Raw Audio Duration"
+                ] += task_data["audio_duration"]
+                result_meta_stats[ann_obj.annotation_status][
+                    "Segment Duration"
+                ] += get_audio_transcription_duration(ann_obj.result)
+                result_meta_stats[ann_obj.annotation_status][
+                    "Not Null Segment Duration"
+                ] += get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id)


def calculate_ced_between_two_annotations(annotation1, annotation2):
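Reviewer note: the new fast path reads exactly two keys from the cached dict and, notably, skips the "Raw Audio Duration" increment that the recompute path still performs. A minimal sketch of the payload it expects (key names come from this diff; the numbers are illustrative):

    cached_meta_stats = {
        "total_segment_duration": 42.7,  # seconds, as computed by get_audio_transcription_duration
        "not_null_segment_duration": 39.1,  # seconds of segments with non-empty text
    }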
695 changes: 592 additions & 103 deletions backend/organizations/tasks.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/projects/utils.py
@@ -173,7 +173,7 @@ def get_audio_transcription_duration(annotation_result):
    return audio_duration


-def get_not_null_audio_transcription_duration(annotation_result, ann_id):
+def get_not_null_audio_transcription_duration(annotation_result, ann_id=None):
    audio_duration = 0
    memory = create_memory(annotation_result)
    for key, indexes in memory.items():
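Reviewer note: ann_id becomes optional because the new helpers in backend/tasks/utils.py call this function without it. Both call forms are now valid:

    get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id)  # existing call sites
    get_not_null_audio_transcription_duration(result)  # new call sites in tasks/utils.py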
22 changes: 22 additions & 0 deletions backend/tasks/migrations/0049_annotation_meta_stats.py
@@ -0,0 +1,22 @@
# Generated by Django 3.2.14 on 2024-09-19 03:43

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("tasks", "0048_alter_annotation_unique_together"),
    ]

    operations = [
        migrations.AddField(
            model_name="annotation",
            name="meta_stats",
            field=models.JSONField(
                blank=True,
                help_text="Meta statistics for the annotation result",
                null=True,
                verbose_name="meta_stats",
            ),
        ),
    ]
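Reviewer note: existing Annotation rows will have meta_stats = NULL after this migration, so update_meta_stats keeps taking the recompute path for them. A hypothetical one-off backfill (not part of this PR; backfill_meta_stats is an invented name) could warm the cache:

    from tasks.models import Annotation
    from tasks.utils import compute_meta_stats_for_annotation

    def backfill_meta_stats(batch_size=500):
        # Populate meta_stats for rows created before this migration so the
        # cached path in update_meta_stats can be taken for them as well.
        qs = Annotation.objects.filter(meta_stats__isnull=True).select_related("task")
        for ann in qs.iterator(chunk_size=batch_size):
            project_type = ann.task.project_id.project_type
            ann.meta_stats = compute_meta_stats_for_annotation(ann, project_type)
            ann.save(update_fields=["meta_stats"])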
7 changes: 7 additions & 0 deletions backend/tasks/models.py
@@ -261,6 +261,13 @@ class Annotation(models.Model):
        help_text=("Time when the annotation was first labeled/accepted/validated"),
    )

+    meta_stats = models.JSONField(
+        blank=True,
+        null=True,
+        verbose_name="meta_stats",
+        help_text="Meta statistics for the annotation result",
+    )
+
    def __str__(self):
        return str(self.id)

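Reviewer note: since the field is nullable, readers should tolerate None. A defensive read looks like this (sketch, not in the PR):

    stats = annotation.meta_stats or {}
    segment_duration = stats.get("total_segment_duration", 0)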
231 changes: 231 additions & 0 deletions backend/tasks/utils.py
@@ -1,7 +1,27 @@
import os
import re
from requests import RequestException
import requests
from dotenv import load_dotenv
from projects.utils import (
    no_of_words,
    get_audio_project_types,
    get_audio_transcription_duration,
    get_not_null_audio_transcription_duration,
    calculate_word_error_rate_between_two_audio_transcription_annotation,
)
from tasks.models import (
    Annotation,
    REVIEWER_ANNOTATION,
    ANNOTATOR_ANNOTATION,
    SUPER_CHECKER_ANNOTATION,
    ACCEPTED,
    ACCEPTED_WITH_MINOR_CHANGES,
    ACCEPTED_WITH_MAJOR_CHANGES,
    VALIDATED,
    VALIDATED_WITH_CHANGES,
)


Queued_Task_name = {
    "dataset.tasks.deduplicate_dataset_instance_items": "Deduplicate Dataset Instance Items",
@@ -60,3 +80,214 @@ def query_flower(filters=None):
        return {"error": "Failed to retrieve tasks from Flower"}
    except RequestException as e:
        return {"error": f" failed to connect to flower API, {str(e)}"}


def compute_meta_stats_for_annotation(ann_obj, project_type):
    from tasks.views import SentenceOperationViewSet

    task_obj = ann_obj.task
    task_data = task_obj.data
    ced_project_type_choices = ["ContextualTranslationEditing"]
    result_meta_stats = {}
    result = ann_obj.result

    # calculating WER and BLEU scores between the annotator (a), reviewer (r)
    # and super-checker (s) annotations of the same task
    all_annotations = Annotation.objects.filter(task_id=task_obj.id)
    ar_wer_score, as_wer_score, rs_wer_score = 0, 0, 0
    ar_bleu_score, rs_bleu_score = 0, 0
    ar_done, as_done, rs_done = False, False, False
    ann_ann, rev_ann, sup_ann = "", "", ""
    for a in all_annotations:
        if a.annotation_type == REVIEWER_ANNOTATION and a.annotation_status in [
            ACCEPTED,
            ACCEPTED_WITH_MINOR_CHANGES,
            ACCEPTED_WITH_MAJOR_CHANGES,
        ]:
            rev_ann = a
        elif a.annotation_type == SUPER_CHECKER_ANNOTATION and a.annotation_status in [
            VALIDATED,
            VALIDATED_WITH_CHANGES,
        ]:
            sup_ann = a
        elif a.annotation_type == ANNOTATOR_ANNOTATION:
            ann_ann = a
        if ann_ann and rev_ann and not ar_done:
            try:
                ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation(
                    rev_ann.result, ann_ann.result, project_type
                )
                ar_done = True
            except Exception:
                pass
            try:
                s1 = SentenceOperationViewSet()
                sampleRequest = {
                    "annotation_result1": rev_ann.result,
                    "annotation_result2": ann_ann.result,
                }
                ar_bleu_score += float(
                    s1.calculate_bleu_score(sampleRequest).data["ar_bleu_score"]
                )
            except Exception:
                pass
        if rev_ann and sup_ann and not rs_done:
            try:
                rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation(
                    sup_ann.result, rev_ann.result, project_type
                )
                rs_done = True
            except Exception:
                pass
            try:
                s1 = SentenceOperationViewSet()
                sampleRequest = {
                    "annotation_result1": sup_ann.result,
                    "annotation_result2": rev_ann.result,
                }
                rs_bleu_score += float(
                    s1.calculate_bleu_score(sampleRequest).data["rs_bleu_score"]
                )
            except Exception:
                pass
        if ann_ann and sup_ann and not as_done:
            # reuse the WER cached in the super checker's meta_stats when
            # available; only recompute when it is missing
            meta_stats = sup_ann.meta_stats
            if meta_stats and "as_wer_score" in meta_stats:
                as_wer_score += meta_stats["as_wer_score"]
                as_done = True
            else:
                try:
                    as_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation(
                        sup_ann.result, ann_ann.result, project_type
                    )
                    as_done = True
                except Exception:
                    pass

    if project_type == "AcousticNormalisedTranscriptionEditing":
        (
            acoustic_normalised_word_count,
            verbatim_word_count,
            acoustic_normalised_duration,
            verbatim_duration,
        ) = (0, 0, 0, 0)
        for r in result:
            if r["from_name"] == "acoustic_normalised_transcribed_json":
                acoustic_normalised_word_count += calculateWordCount(r)
                acoustic_normalised_duration += calculateAudioDuration(r)
            elif r["from_name"] == "verbatim_transcribed_json":
                verbatim_word_count += calculateWordCount(r)
                verbatim_duration += calculateAudioDuration(r)
        segment_duration = get_audio_transcription_duration(result)
        not_null_segment_duration = get_not_null_audio_transcription_duration(result)
        return {
            "acoustic_normalised_word_count": acoustic_normalised_word_count,
            "verbatim_word_count": verbatim_word_count,
            "acoustic_normalised_duration": acoustic_normalised_duration,
            "verbatim_duration": verbatim_duration,
            "total_segment_duration": segment_duration,
            "not_null_segment_duration": not_null_segment_duration,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }
    elif project_type in ["AudioTranscription", "AudioTranscriptionEditing"]:
        transcribed_word_count, transcribed_duration = 0, 0
        for r in result:
            if r["from_name"] == "transcribed_json":
                transcribed_word_count += calculateWordCount(r)
                transcribed_duration += calculateAudioDuration(r)
        segment_duration = get_audio_transcription_duration(result)
        not_null_segment_duration = get_not_null_audio_transcription_duration(result)
        return {
            "audio_word_count": transcribed_word_count,
            "transcribed_duration": transcribed_duration,
            "total_segment_duration": segment_duration,
            "not_null_segment_duration": not_null_segment_duration,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }
    elif project_type in [
        "ContextualSentenceVerification",
        "ContextualSentenceVerificationAndDomainClassification",
        "ContextualTranslationEditing",
        "TranslationEditing",
    ]:
        word_count = 0
        for r in result:
            if r["type"] == "textarea":
                word_count += calculateWordCount(r)
        return {
            "word_count": word_count,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }
    elif project_type in [
        "ConversationTranslation",
        "ConversationTranslationEditing",
        "ConversationVerification",
    ]:
        word_count, sentence_count = 0, 0
        for r in result:
            if r["type"] == "textarea":
                word_count += calculateWordCount(r)
                sentence_count += calculateSentenceCount(r["value"]["text"][0])

        return {
            "word_count": word_count,
            "sentence_count": sentence_count,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }
    elif project_type in [
        "OCRTranscription",
        "OCRTranscriptionEditing",
        "OCRSegmentCategorizationEditing",
    ]:
        word_count = 0
        for r in result:
            if r["from_name"] == "ocr_transcribed_json":
                word_count += calculateWordCount(r)
        return {
            "word_count": word_count,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }


def calculateWordCount(annotation_result):
    word_count = 0
    try:
        word_count = no_of_words(annotation_result["value"]["text"][0])
    except Exception:
        pass
    return word_count


def calculateAudioDuration(annotation_result):
    try:
        start = annotation_result["value"]["start"]
        end = annotation_result["value"]["end"]
    except Exception:
        start, end = 0, 0
    return abs(end - start)


def calculateSentenceCount(text):
    sentences = re.split(r"[.!?]+", text)
    return len([sentence for sentence in sentences if sentence.strip()])
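Reviewer note: for orientation, this is roughly what the helper returns for a plain AudioTranscription annotation; a sketch with illustrative numbers:

    stats = compute_meta_stats_for_annotation(ann_obj, "AudioTranscription")
    # {
    #     "audio_word_count": 57,
    #     "transcribed_duration": 41.3,
    #     "total_segment_duration": 42.7,
    #     "not_null_segment_duration": 39.1,
    #     "ar_wer_score": 0.12,
    #     "as_wer_score": 0,
    #     "rs_wer_score": 0,
    #     "ar_bleu_score": 0.81,
    #     "rs_bleu_score": 0,
    # }
    # Project types that match no branch fall through and return None.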
5 changes: 5 additions & 0 deletions backend/tasks/views.py
@@ -59,6 +59,7 @@

from utils.date_time_conversions import utc_to_ist
from django.db import IntegrityError
+from .utils import compute_meta_stats_for_annotation

# Create your views here.

@@ -2335,6 +2336,10 @@ def partial_update(self, request, pk=None):
                if supercheck_status in [UNVALIDATED, REJECTED, DRAFT, SKIPPED]:
                    task.correct_annotation = None
                    task.save()
+        annotation_obj.meta_stats = compute_meta_stats_for_annotation(
+            annotation_obj, annotation_obj.task.project_id.project_type
+        )
+        annotation_obj.save()
        annotation_response.data["message"] = response_message
        return annotation_response

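Reviewer note: compute_meta_stats_for_annotation returns None for project types it does not handle, so this hook overwrites any previously stored stats with None for those projects. A guard would preserve them; a sketch of that alternative, not what the PR does:

    new_stats = compute_meta_stats_for_annotation(
        annotation_obj, annotation_obj.task.project_id.project_type
    )
    if new_stats is not None:
        annotation_obj.meta_stats = new_stats
        annotation_obj.save(update_fields=["meta_stats"])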