From ccd61635bd5d7c2188c474226f0c8ba530878904 Mon Sep 17 00:00:00 2001 From: Kunal Tiwary Date: Tue, 20 Aug 2024 04:45:31 +0000 Subject: [PATCH 1/5] initial changes in partial_update --- backend/tasks/models.py | 7 +++ backend/tasks/utils.py | 104 ++++++++++++++++++++++++++++++++++++++++ backend/tasks/views.py | 5 ++ 3 files changed, 116 insertions(+) diff --git a/backend/tasks/models.py b/backend/tasks/models.py index 7fac09c7a..e636a01cb 100644 --- a/backend/tasks/models.py +++ b/backend/tasks/models.py @@ -261,6 +261,13 @@ class Annotation(models.Model): help_text=("Time when the annotation was first labeled/accepted/validated"), ) + meta_stats = models.JSONField( + blank=True, + null=True, + verbose_name="meta_stats", + help_text="Meta statistics for the annotation result", + ) + def __str__(self): return str(self.id) diff --git a/backend/tasks/utils.py b/backend/tasks/utils.py index 442901c8b..3af7ea41c 100644 --- a/backend/tasks/utils.py +++ b/backend/tasks/utils.py @@ -1,7 +1,16 @@ import os +import re from requests import RequestException import requests from dotenv import load_dotenv +from functions.tasks import update_meta_stats +from projects.utils import ( + no_of_words, + get_audio_project_types, + get_audio_transcription_duration, + get_not_null_audio_transcription_duration, +) + Queued_Task_name = { "dataset.tasks.deduplicate_dataset_instance_items": "Deduplicate Dataset Instance Items", @@ -60,3 +69,98 @@ def query_flower(filters=None): return {"error": "Failed to retrieve tasks from Flower"} except RequestException as e: return {"error": f" failed to connect to flower API, {str(e)}"} + + +def compute_meta_stats_for_annotation(ann_obj, project_type): + task_obj = ann_obj.task + task_data = task_obj.data + ced_project_type_choices = ["ContextualTranslationEditing"] + result_meta_stats = {} + result = ann_obj.result + if project_type == "AcousticNormalisedTranscriptionEditing": + ( + acousticNormalisedWordCount, + verbatimWordCount, + acousticNormalisedDuration, + verbatimDuration, + ) = (0, 0, 0, 0) + for r in result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acousticNormalisedWordCount += calculateWordCount(ann_obj.result) + acousticNormalisedDuration += calculateAudioDuration(ann_obj.result) + elif r["from_name"] == "verbatim_transcribed_json": + verbatimWordCount += calculateWordCount(ann_obj.result) + verbatimDuration += calculateAudioDuration(ann_obj.result) + # elif r["from_name"] == "transcribed_json": + return { + "acousticNormalisedWordCount": acousticNormalisedWordCount, + "verbatimWordCount": verbatimWordCount, + "acousticNormalisedDuration": acousticNormalisedDuration, + "verbatimDuration": verbatimDuration, + } + elif project_type in ["AudioTranscription", "AudioTranscriptionEditing"]: + wordCount, transcribedDuration = 0, 0 + for r in result: + if r["from_name"] == "transcribed_json": + wordCount += calculateWordCount(ann_obj.result) + transcribedDuration += calculateAudioDuration(ann_obj.result) + return {"wordCount": wordCount, "transcribedDuration": transcribedDuration} + elif project_type in [ + "ContextualSentenceVerification", + "ContextualSentenceVerificationAndDomainClassification", + "ContextualTranslationEditing", + "TranslationEditing", + ]: + wordCount = 0 + for r in result: + if r["type"] == "textarea": + wordCount += calculateWordCount(ann_obj.result) + return {"wordCount": wordCount} + elif project_type in [ + "ConversationTranslation", + "ConversationTranslationEditing", + "ConversationVerification", + ]: + wordCount, 
sentenceCount = 0, 0 + for r in result: + if r["type"] == "textarea": + wordCount += calculateWordCount(ann_obj.result) + sentenceCount += calculateSentenceCount( + ann_obj.result["value"]["text"][0] + ) + + return {"wordCount": wordCount, "sentenceCount": sentenceCount} + elif project_type in [ + "OCRTranscription", + "OCRTranscriptionEditing", + "OCRSegmentCategorizationEditing", + ]: + wordCount = 0 + for r in result: + if r["from_name"] == "ocr_transcribed_json": + wordCount += calculateWordCount(ann_obj.result) + return {"wordCount": wordCount} + + +def calculateWordCount(annotation_result): + word_count = 0 + try: + word_count = no_of_words(annotation_result["value"]["text"][0]) + except: + pass + return word_count + + +def calculateAudioDuration(annotation_result): + try: + start = annotation_result["value"]["start"] + end = annotation_result["value"]["end"] + except: + start, end = 0, 0 + pass + return abs(end - start) + + +def calculateSentenceCount(text): + sentences = re.split(r"[.!?]+", text) + return len([sentence for sentence in sentences if sentence.strip()]) diff --git a/backend/tasks/views.py b/backend/tasks/views.py index 991c4f141..7f549cfbd 100644 --- a/backend/tasks/views.py +++ b/backend/tasks/views.py @@ -59,6 +59,7 @@ from utils.date_time_conversions import utc_to_ist from django.db import IntegrityError +from .utils import compute_meta_stats_for_annotation # Create your views here. @@ -2335,6 +2336,10 @@ def partial_update(self, request, pk=None): if supercheck_status in [UNVALIDATED, REJECTED, DRAFT, SKIPPED]: task.correct_annotation = None task.save() + annotation_obj.meta_stats = compute_meta_stats_for_annotation( + annotation_obj, annotation_obj.task.project_id.project_type + ) + annotation_obj.save() annotation_response.data["message"] = response_message return annotation_response From bae2c1057c882e4465de9a93c094ec61264e8095 Mon Sep 17 00:00:00 2001 From: Kunal Tiwary Date: Wed, 28 Aug 2024 05:41:41 +0000 Subject: [PATCH 2/5] added initial updated code --- backend/functions/tasks.py | 30 +++-- backend/organizations/tasks.py | 227 ++++++++++++++++++++++----------- backend/workspaces/tasks.py | 227 ++++++++++++++++++++++----------- 3 files changed, 331 insertions(+), 153 deletions(-) diff --git a/backend/functions/tasks.py b/backend/functions/tasks.py index e52b02a84..92a0dc917 100644 --- a/backend/functions/tasks.py +++ b/backend/functions/tasks.py @@ -1479,15 +1479,27 @@ def update_meta_stats( ann_obj.result ) elif project_type in get_audio_project_types(): - result_meta_stats[ann_obj.annotation_status]["Raw Audio Duration"] += task_data[ - "audio_duration" - ] - result_meta_stats[ann_obj.annotation_status][ - "Segment Duration" - ] += get_audio_transcription_duration(ann_obj.result) - result_meta_stats[ann_obj.annotation_status][ - "Not Null Segment Duration" - ] += get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id) + meta_stats = ann_obj.meta_stats + if meta_stats: + result_meta_stats[ann_obj.annotation_status][ + "Raw Audio Duration" + ] = meta_stats["Raw Audio Duration"] + result_meta_stats[ann_obj.annotation_status][ + "Segment Duration" + ] = meta_stats["Segment Duration"] + result_meta_stats[ann_obj.annotation_status][ + "Not Null Segment Duration" + ] = meta_stats["Not Null Segment Duration"] + else: + result_meta_stats[ann_obj.annotation_status][ + "Raw Audio Duration" + ] += task_data["audio_duration"] + result_meta_stats[ann_obj.annotation_status][ + "Segment Duration" + ] += get_audio_transcription_duration(ann_obj.result) + 
result_meta_stats[ann_obj.annotation_status][ + "Not Null Segment Duration" + ] += get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id) def calculate_ced_between_two_annotations(annotation1, annotation2): diff --git a/backend/organizations/tasks.py b/backend/organizations/tasks.py index 5fef46978..ff8231aac 100644 --- a/backend/organizations/tasks.py +++ b/backend/organizations/tasks.py @@ -119,27 +119,41 @@ def get_all_annotation_reports( if a.annotation_type == REVIEWER_ANNOTATION: number_of_tasks_that_has_review_annotations += 1 if ann_ann and rev_ann and not ar_done: - try: - ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( - rev_ann.result, ann_ann.result, project_type - ) + meta_stats = rev_ann.meta_stats + if "word_error_rate" in meta_stats: + ar_wer_score += meta_stats["word_error_rate"] number_of_tasks_contributed_for_ar_wer += 1 ar_done = True - except Exception as e: - pass - try: - s1 = SentenceOperationViewSet() - sampleRequest = { - "annotation_result1": rev_ann.result, - "annotation_result2": ann_ann.result, - } - ar_bleu_score += float( - s1.calculate_bleu_score(sampleRequest).data["bleu_score"] - ) - number_of_tasks_contributed_for_ar_bleu += 1 - except Exception as e: - pass + else: + try: + ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + rev_ann.result, ann_ann.result, project_type + ) + number_of_tasks_contributed_for_ar_wer += 1 + ar_done = True + except Exception as e: + pass + if "bleu_score" in meta_stats: + ar_bleu_score += meta_stats["bleu_score"] + else: + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": rev_ann.result, + "annotation_result2": ann_ann.result, + } + ar_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["bleu_score"] + ) + number_of_tasks_contributed_for_ar_bleu += 1 + except Exception as e: + pass if ann_ann and sup_ann and not as_done: + meta_stats = sup_ann.meta_stats + if "word_error_rate" in meta_stats: + as_wer_score += meta_stats["word_error_rate"] + number_of_tasks_contributed_for_as_wer += 1 + as_done = True try: as_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( sup_ann.result, ann_ann.result, project_type @@ -164,24 +178,44 @@ def get_all_annotation_reports( only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - total_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + try: + total_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - total_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + total_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: - try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) - total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) - except: - pass + meta_stats = anno.meta_stats + if "total_audio_duration" in meta_stats: + total_audio_duration_list.append(meta_stats["total_audio_duration"]) + else: + try: + total_audio_duration_list.append( + 
get_audio_transcription_duration(anno.result) + ) + except: + pass + if "raw_audio_duration" in meta_stats: + total_raw_audio_duration_list.append(meta_stats["raw_audio_duration"]) + else: + try: + total_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass else: only_tasks = True @@ -335,26 +369,35 @@ def get_all_review_reports( if a.annotation_type == SUPER_CHECKER_ANNOTATION: number_of_tasks_that_has_sup_annotations += 1 if rev_ann and sup_ann and not rs_done: - try: - rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( - sup_ann.result, rev_ann.result, project_type - ) + meta_stats = sup_ann.meta_stats + if "word_error_rate" in meta_stats: + rs_wer_score += meta_stats["word_error_rate"] number_of_tasks_contributed_for_rs_wer += 1 rs_done = True - except Exception as e: - pass - try: - s1 = SentenceOperationViewSet() - sampleRequest = { - "annotation_result1": sup_ann.result, - "annotation_result2": rev_ann.result, - } - rs_bleu_score += float( - s1.calculate_bleu_score(sampleRequest).data["bleu_score"] - ) - number_of_tasks_contributed_for_rs_bleu += 1 - except Exception as e: - pass + else: + try: + rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + sup_ann.result, rev_ann.result, project_type + ) + number_of_tasks_contributed_for_rs_wer += 1 + rs_done = True + except Exception as e: + pass + if "bleu_score" in meta_stats: + rs_bleu_score += meta_stats["bleu_score"] + else: + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": sup_ann.result, + "annotation_result2": rev_ann.result, + } + rs_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["bleu_score"] + ) + number_of_tasks_contributed_for_rs_bleu += 1 + except Exception as e: + pass submitted_tasks_count = submitted_tasks.count() project_type_lower = project_type.lower() @@ -370,24 +413,44 @@ def get_all_review_reports( only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - total_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + try: + total_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - total_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + total_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: - try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) - total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) - except: - pass + meta_stats = anno.meta_stats + if "total_audio_duration" in meta_stats: + total_audio_duration_list.append(meta_stats["total_audio_duration"]) + else: + try: + total_audio_duration_list.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass + if "raw_audio_duration" in meta_stats: + total_raw_audio_duration_list.append(meta_stats["raw_audio_duration"]) + else: + try: + total_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass else: only_tasks = True @@ -514,26 +577,46 @@ def get_all_supercheck_reports( only_tasks = False if 
is_translation_project: for anno in submitted_tasks: - try: - validated_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + validated_word_count_list.append(meta_stats["word_count"]) + else: + try: + validated_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - validated_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + validated_word_count_list.append(meta_stats["word_count"]) + else: + validated_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: - try: - validated_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) + meta_stats = anno.meta_stats + if "total_audio_duration" in meta_stats: + validated_audio_duration_list.append(meta_stats["total_audio_duration"]) + else: + try: + validated_audio_duration_list.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass + if "raw_audio_duration" in meta_stats: validated_raw_audio_duration_list.append( - anno.task.data["audio_duration"] + meta_stats["raw_audio_duration"] ) - except: - pass + else: + try: + validated_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass else: only_tasks = True diff --git a/backend/workspaces/tasks.py b/backend/workspaces/tasks.py index 4720d6a6d..90aae0516 100644 --- a/backend/workspaces/tasks.py +++ b/backend/workspaces/tasks.py @@ -116,27 +116,41 @@ def get_all_annotation_reports( if a.annotation_type == REVIEWER_ANNOTATION: number_of_tasks_that_has_review_annotations += 1 if ann_ann and rev_ann and not ar_done: - try: - ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( - rev_ann.result, ann_ann.result, project_type - ) + meta_stats = rev_ann.meta_stats + if "word_error_rate" in meta_stats: + ar_wer_score += meta_stats["word_error_rate"] number_of_tasks_contributed_for_ar_wer += 1 ar_done = True - except Exception as e: - pass - try: - s1 = SentenceOperationViewSet() - sampleRequest = { - "annotation_result1": rev_ann.result, - "annotation_result2": ann_ann.result, - } - ar_bleu_score += float( - s1.calculate_bleu_score(sampleRequest).data["bleu_score"] - ) - number_of_tasks_contributed_for_ar_bleu += 1 - except Exception as e: - pass + else: + try: + ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + rev_ann.result, ann_ann.result, project_type + ) + number_of_tasks_contributed_for_ar_wer += 1 + ar_done = True + except Exception as e: + pass + if "bleu_score" in meta_stats: + ar_bleu_score += meta_stats["bleu_score"] + else: + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": rev_ann.result, + "annotation_result2": ann_ann.result, + } + ar_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["bleu_score"] + ) + number_of_tasks_contributed_for_ar_bleu += 1 + except Exception as e: + pass if ann_ann and sup_ann and not as_done: + meta_stats = sup_ann.meta_stats + if "word_error_rate" in meta_stats: + as_wer_score += meta_stats["word_error_rate"] + number_of_tasks_contributed_for_as_wer += 1 + as_done = True try: as_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( sup_ann.result, ann_ann.result, project_type @@ -161,24 +175,44 
@@ def get_all_annotation_reports( only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - total_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + try: + total_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - total_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + total_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: - try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) - total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) - except: - pass + meta_stats = anno.meta_stats + if "total_audio_duration" in meta_stats: + total_audio_duration_list.append(meta_stats["total_audio_duration"]) + else: + try: + total_audio_duration_list.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass + if "raw_audio_duration" in meta_stats: + total_raw_audio_duration_list.append(meta_stats["raw_audio_duration"]) + else: + try: + total_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass else: only_tasks = True @@ -333,26 +367,35 @@ def get_all_review_reports( if a.annotation_type == SUPER_CHECKER_ANNOTATION: number_of_tasks_that_has_sup_annotations += 1 if rev_ann and sup_ann and not rs_done: - try: - rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( - sup_ann.result, rev_ann.result, project_type - ) + meta_stats = sup_ann.meta_stats + if "word_error_rate" in meta_stats: + rs_wer_score += meta_stats["word_error_rate"] number_of_tasks_contributed_for_rs_wer += 1 rs_done = True - except Exception as e: - pass - try: - s1 = SentenceOperationViewSet() - sampleRequest = { - "annotation_result1": sup_ann.result, - "annotation_result2": rev_ann.result, - } - rs_bleu_score += float( - s1.calculate_bleu_score(sampleRequest).data["bleu_score"] - ) - number_of_tasks_contributed_for_rs_bleu += 1 - except Exception as e: - pass + else: + try: + rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + sup_ann.result, rev_ann.result, project_type + ) + number_of_tasks_contributed_for_rs_wer += 1 + rs_done = True + except Exception as e: + pass + if "bleu_score" in meta_stats: + rs_bleu_score += meta_stats["bleu_score"] + else: + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": sup_ann.result, + "annotation_result2": rev_ann.result, + } + rs_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["bleu_score"] + ) + number_of_tasks_contributed_for_rs_bleu += 1 + except Exception as e: + pass submitted_tasks_count = submitted_tasks.count() project_type_lower = project_type.lower() @@ -368,24 +411,44 @@ def get_all_review_reports( only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - total_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + try: + total_word_count_list.append(anno.task.data["word_count"]) + 
except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - total_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + total_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: - try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) - total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) - except: - pass + meta_stats = anno.meta_stats + if "total_audio_duration" in meta_stats: + total_audio_duration_list.append(meta_stats["total_audio_duration"]) + else: + try: + total_audio_duration_list.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass + if "raw_audio_duration" in meta_stats: + total_raw_audio_duration_list.append(meta_stats["raw_audio_duration"]) + else: + try: + total_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass else: only_tasks = True total_word_count = sum(total_word_count_list) @@ -512,26 +575,46 @@ def get_all_supercheck_reports( only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - validated_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + validated_word_count_list.append(meta_stats["word_count"]) + else: + try: + validated_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - validated_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + validated_word_count_list.append(meta_stats["word_count"]) + else: + validated_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: - try: - validated_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) + meta_stats = anno.meta_stats + if "total_audio_duration" in meta_stats: + validated_audio_duration_list.append(meta_stats["total_audio_duration"]) + else: + try: + validated_audio_duration_list.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass + if "raw_audio_duration" in meta_stats: validated_raw_audio_duration_list.append( - anno.task.data["audio_duration"] + meta_stats["raw_audio_duration"] ) - except: - pass + else: + try: + validated_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass else: only_tasks = True From f6cf078fd6b8f12e241cd65dee640a4df05f2581 Mon Sep 17 00:00:00 2001 From: Kunal Tiwary Date: Thu, 19 Sep 2024 07:08:29 +0000 Subject: [PATCH 3/5] added final changes --- backend/functions/tasks.py | 7 +- backend/organizations/tasks.py | 526 +++++++++++++++--- backend/projects/utils.py | 2 +- .../migrations/0049_annotation_meta_stats.py | 18 + backend/tasks/utils.py | 168 +++++- backend/workspaces/tasks.py | 524 ++++++++++++++--- 6 files changed, 1049 insertions(+), 196 deletions(-) create mode 100644 backend/tasks/migrations/0049_annotation_meta_stats.py diff --git a/backend/functions/tasks.py b/backend/functions/tasks.py index 92a0dc917..cfea42864 100644 --- a/backend/functions/tasks.py +++ b/backend/functions/tasks.py @@ -1481,15 +1481,12 @@ def update_meta_stats( 
elif project_type in get_audio_project_types(): meta_stats = ann_obj.meta_stats if meta_stats: - result_meta_stats[ann_obj.annotation_status][ - "Raw Audio Duration" - ] = meta_stats["Raw Audio Duration"] result_meta_stats[ann_obj.annotation_status][ "Segment Duration" - ] = meta_stats["Segment Duration"] + ] = meta_stats["total_segment_duration"] result_meta_stats[ann_obj.annotation_status][ "Not Null Segment Duration" - ] = meta_stats["Not Null Segment Duration"] + ] = meta_stats["not_null_segment_duration"] else: result_meta_stats[ann_obj.annotation_status][ "Raw Audio Duration" diff --git a/backend/organizations/tasks.py b/backend/organizations/tasks.py index ff8231aac..d4440e0cd 100644 --- a/backend/organizations/tasks.py +++ b/backend/organizations/tasks.py @@ -35,6 +35,7 @@ un_pack_annotation_tasks, ) from django.db.models import Q +from tasks.utils import calculateWordCount, calculateAudioDuration, calculateSentenceCount def get_all_annotation_reports( @@ -172,7 +173,13 @@ def get_all_annotation_reports( if project_type in ["ConversationTranslationEditing", "ConversationTranslation"] else False ) - total_audio_duration_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] total_raw_audio_duration_list = [] total_word_count_list = [] only_tasks = False @@ -198,32 +205,116 @@ def get_all_annotation_reports( ): for anno in submitted_tasks: meta_stats = anno.meta_stats - if "total_audio_duration" in meta_stats: - total_audio_duration_list.append(meta_stats["total_audio_duration"]) + if not meta_stats: + meta_stats = [] + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if 
r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) else: - try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) - except: - pass - if "raw_audio_duration" in meta_stats: - total_raw_audio_duration_list.append(meta_stats["raw_audio_duration"]) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + try: + total_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) else: try: - total_raw_audio_duration_list.append( - anno.task.data["audio_duration"] + total_segment_duration.append( + get_audio_transcription_duration(anno.result) ) except: pass else: only_tasks = True - - total_word_count = sum(total_word_count_list) - total_audio_duration = convert_seconds_to_hours(sum(total_audio_duration_list)) - total_raw_audio_duration = convert_seconds_to_hours( - sum(total_raw_audio_duration_list) - ) + total_raw_audio_duration, total_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + total_raw_audio_duration = convert_seconds_to_hours( + sum(total_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours( + sum(total_segment_duration) + ) + else: + total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 if tasks_and_rejection_count_map_ar: for task, rc in tasks_and_rejection_count_map_ar.items(): @@ -234,10 +325,16 @@ def get_all_annotation_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Annotator", - "Total Segments Duration": total_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": 
verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": total_raw_audio_duration, - "Word Count": total_word_count, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Submitted Tasks": submitted_tasks_count, + "Word Count": total_word_count, "Language": user_lang, "Average Word Error Rate Annotator Vs Reviewer": ar_wer_score / number_of_tasks_contributed_for_ar_wer @@ -265,17 +362,44 @@ def get_all_annotation_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result @@ -407,9 +531,15 @@ def get_all_review_reports( if project_type in ["ConversationTranslationEditing", "ConversationTranslation"] else False ) - total_audio_duration_list = [] total_raw_audio_duration_list = [] total_word_count_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] only_tasks = False if is_translation_project: for anno in submitted_tasks: @@ -429,36 +559,119 @@ def get_all_review_reports( else: total_word_count_list.append(ocr_word_count(anno.result)) elif ( - project_type in get_audio_project_types() or project_type == "AllAudioProjects" + project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: meta_stats = anno.meta_stats - if "total_audio_duration" in meta_stats: - total_audio_duration_list.append(meta_stats["total_audio_duration"]) + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + 
acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) else: - try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) - except: - pass - if "raw_audio_duration" in meta_stats: - total_raw_audio_duration_list.append(meta_stats["raw_audio_duration"]) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) else: try: - total_raw_audio_duration_list.append( - 
anno.task.data["audio_duration"] + total_segment_duration.append( + get_audio_transcription_duration(anno.result) ) except: pass + try: + total_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass else: only_tasks = True - total_word_count = sum(total_word_count_list) - total_audio_duration = convert_seconds_to_hours(sum(total_audio_duration_list)) - total_raw_audio_duration = convert_seconds_to_hours( - sum(total_raw_audio_duration_list) - ) + total_raw_audio_duration, total_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + total_raw_audio_duration = convert_seconds_to_hours( + sum(total_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours( + sum(total_segment_duration) + ) + else: + total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 if tasks_and_rejection_count_map_ar: for task, rc in tasks_and_rejection_count_map_ar.items(): @@ -475,8 +688,14 @@ def get_all_review_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Review", - "Total Segments Duration": total_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": total_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Word Count": total_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -503,17 +722,44 @@ def get_all_review_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim 
Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result @@ -572,7 +818,13 @@ def get_all_supercheck_reports( else False ) validated_word_count_list = [] - validated_audio_duration_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] validated_raw_audio_duration_list = [] only_tasks = False if is_translation_project: @@ -593,40 +845,119 @@ def get_all_supercheck_reports( else: validated_word_count_list.append(ocr_word_count(anno.result)) elif ( - project_type in get_audio_project_types() or project_type == "AllAudioProjects" + project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: meta_stats = anno.meta_stats - if "total_audio_duration" in meta_stats: - validated_audio_duration_list.append(meta_stats["total_audio_duration"]) + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == 
"acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + else: + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) else: try: - validated_audio_duration_list.append( + total_segment_duration.append( get_audio_transcription_duration(anno.result) ) except: pass - if "raw_audio_duration" in meta_stats: + try: validated_raw_audio_duration_list.append( - meta_stats["raw_audio_duration"] + anno.task.data["audio_duration"] ) - else: - try: - validated_raw_audio_duration_list.append( - anno.task.data["audio_duration"] - ) - except: - pass + except: + pass else: only_tasks = True - validated_word_count = sum(validated_word_count_list) - validated_audio_duration = convert_seconds_to_hours( - sum(validated_audio_duration_list) - ) - validated_raw_audio_duration = convert_seconds_to_hours( - sum(validated_raw_audio_duration_list) - ) + validated_raw_audio_duration, validated_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + validated_raw_audio_duration = convert_seconds_to_hours( + sum(validated_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours( + sum(total_segment_duration) + ) + else: + validated_word_count = sum(validated_word_count_list) cumulative_rejection_score_rs = 0 if tasks_and_rejection_count_map_rs: for task, rc in tasks_and_rejection_count_map_rs.items(): @@ -638,8 +969,14 @@ def get_all_supercheck_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Supercheck", - "Total Segments Duration": validated_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": validated_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": 
transcribed_word_count, "Word Count": validated_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -651,17 +988,44 @@ def get_all_supercheck_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result diff --git a/backend/projects/utils.py b/backend/projects/utils.py index 931271435..634066e15 100644 --- a/backend/projects/utils.py +++ b/backend/projects/utils.py @@ -173,7 +173,7 @@ def get_audio_transcription_duration(annotation_result): return audio_duration -def get_not_null_audio_transcription_duration(annotation_result, ann_id): +def get_not_null_audio_transcription_duration(annotation_result, ann_id=None): audio_duration = 0 memory = create_memory(annotation_result) for key, indexes in memory.items(): diff --git a/backend/tasks/migrations/0049_annotation_meta_stats.py b/backend/tasks/migrations/0049_annotation_meta_stats.py new file mode 100644 index 000000000..bc235ca03 --- /dev/null +++ b/backend/tasks/migrations/0049_annotation_meta_stats.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.14 on 2024-09-19 03:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('tasks', '0048_alter_annotation_unique_together'), + ] + + operations = [ + migrations.AddField( + model_name='annotation', + name='meta_stats', + field=models.JSONField(blank=True, help_text='Meta statistics for the annotation result', null=True, verbose_name='meta_stats'), + ), + ] diff --git a/backend/tasks/utils.py b/backend/tasks/utils.py index 3af7ea41c..71c8209c2 100644 --- a/backend/tasks/utils.py +++ b/backend/tasks/utils.py @@ -3,13 +3,14 @@ from requests import RequestException import requests from dotenv import load_dotenv -from functions.tasks import update_meta_stats from projects.utils import ( no_of_words, 
get_audio_project_types, get_audio_transcription_duration, get_not_null_audio_transcription_duration, + calculate_word_error_rate_between_two_audio_transcription_annotation, ) +from tasks.models import Annotation, REVIEWER_ANNOTATION, ANNOTATOR_ANNOTATION, SUPER_CHECKER_ANNOTATION, ACCEPTED, ACCEPTED_WITH_MINOR_CHANGES, ACCEPTED_WITH_MAJOR_CHANGES, VALIDATED, VALIDATED_WITH_CHANGES Queued_Task_name = { @@ -72,74 +73,185 @@ def query_flower(filters=None): def compute_meta_stats_for_annotation(ann_obj, project_type): + from tasks.views import SentenceOperationViewSet task_obj = ann_obj.task task_data = task_obj.data ced_project_type_choices = ["ContextualTranslationEditing"] result_meta_stats = {} result = ann_obj.result + + # calculating wer and bleu scores + all_annotations = Annotation.objects.filter(task_id=task_obj.id) + ar_wer_score, as_wer_score, rs_wer_score = 0, 0, 0 + ar_bleu_score, rs_bleu_score = 0, 0 + ar_done, as_done, rs_done = False, False, False + ann_ann, rev_ann, sup_ann = "", "", "" + for a in all_annotations: + if a.annotation_type == REVIEWER_ANNOTATION and a.annotation_status in [ + ACCEPTED, + ACCEPTED_WITH_MINOR_CHANGES, + ACCEPTED_WITH_MAJOR_CHANGES, + ]: + rev_ann = a + elif ( + a.annotation_type == SUPER_CHECKER_ANNOTATION + and a.annotation_status in [VALIDATED, VALIDATED_WITH_CHANGES] + ): + sup_ann = a + elif a.annotation_type == ANNOTATOR_ANNOTATION: + ann_ann = a + if ann_ann and rev_ann and not ar_done: + try: + ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + rev_ann.result, ann_ann.result, project_type + ) + ar_done = True + except Exception as e: + pass + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": rev_ann.result, + "annotation_result2": ann_ann.result, + } + ar_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["ar_bleu_score"] + ) + except Exception as e: + pass + if rev_ann and sup_ann and not rs_done: + try: + rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + sup_ann.result, rev_ann.result, project_type + ) + rs_done = True + except Exception as e: + pass + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": sup_ann.result, + "annotation_result2": rev_ann.result, + } + rs_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["rs_bleu_score"] + ) + except Exception as e: + pass + if ann_ann and sup_ann and not as_done: + meta_stats = sup_ann.meta_stats + if "as_wer_score" in meta_stats: + as_wer_score += meta_stats["as_wer_score"] + as_done = True + try: + as_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + sup_ann.result, ann_ann.result, project_type + ) + as_done = True + except Exception as e: + pass + if project_type == "AcousticNormalisedTranscriptionEditing": ( - acousticNormalisedWordCount, - verbatimWordCount, - acousticNormalisedDuration, - verbatimDuration, + acoustic_normalised_word_count, + verbatim_word_count, + acoustic_normalised_duration, + verbatim_duration, ) = (0, 0, 0, 0) for r in result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acousticNormalisedWordCount += calculateWordCount(ann_obj.result) - acousticNormalisedDuration += calculateAudioDuration(ann_obj.result) + acoustic_normalised_word_count += calculateWordCount(ann_obj.result) + acoustic_normalised_duration += calculateAudioDuration(ann_obj.result) elif r["from_name"] == "verbatim_transcribed_json": - verbatimWordCount += 
calculateWordCount(ann_obj.result) - verbatimDuration += calculateAudioDuration(ann_obj.result) - # elif r["from_name"] == "transcribed_json": + verbatim_word_count += calculateWordCount(ann_obj.result) + verbatim_duration += calculateAudioDuration(ann_obj.result) + segment_duration = get_audio_transcription_duration(result) + not_null_segment_duration = get_not_null_audio_transcription_duration(result) return { - "acousticNormalisedWordCount": acousticNormalisedWordCount, - "verbatimWordCount": verbatimWordCount, - "acousticNormalisedDuration": acousticNormalisedDuration, - "verbatimDuration": verbatimDuration, + "acoustic_normalised_word_count": acoustic_normalised_word_count, + "verbatim_word_count": verbatim_word_count, + "acoustic_normalised_duration": acoustic_normalised_duration, + "verbatim_duration": verbatim_duration, + "total_segment_duration": segment_duration, + "not_null_segment_duration": not_null_segment_duration, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score } elif project_type in ["AudioTranscription", "AudioTranscriptionEditing"]: - wordCount, transcribedDuration = 0, 0 + transcribed_word_count, transcribed_duration = 0, 0 for r in result: if r["from_name"] == "transcribed_json": - wordCount += calculateWordCount(ann_obj.result) - transcribedDuration += calculateAudioDuration(ann_obj.result) - return {"wordCount": wordCount, "transcribedDuration": transcribedDuration} + transcribed_word_count += calculateWordCount(ann_obj.result) + transcribed_duration += calculateAudioDuration(ann_obj.result) + segment_duration = get_audio_transcription_duration(result) + not_null_segment_duration = get_not_null_audio_transcription_duration(result) + return {"audio_word_count": transcribed_word_count, + "transcribed_duration": transcribed_duration, + "total_segment_duration": segment_duration, + "not_null_segment_duration": not_null_segment_duration, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score + } elif project_type in [ "ContextualSentenceVerification", "ContextualSentenceVerificationAndDomainClassification", "ContextualTranslationEditing", "TranslationEditing", ]: - wordCount = 0 + word_count = 0 for r in result: if r["type"] == "textarea": - wordCount += calculateWordCount(ann_obj.result) - return {"wordCount": wordCount} + word_count += calculateWordCount(ann_obj.result) + return {"word_count": word_count, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score + } elif project_type in [ "ConversationTranslation", "ConversationTranslationEditing", "ConversationVerification", ]: - wordCount, sentenceCount = 0, 0 + word_count, sentence_count = 0, 0 for r in result: if r["type"] == "textarea": - wordCount += calculateWordCount(ann_obj.result) - sentenceCount += calculateSentenceCount( + word_count += calculateWordCount(ann_obj.result) + sentence_count += calculateSentenceCount( ann_obj.result["value"]["text"][0] ) - return {"wordCount": wordCount, "sentenceCount": sentenceCount} + return {"word_count": word_count, + "sentence_count": sentence_count, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score + } elif project_type in [ 
"OCRTranscription", "OCRTranscriptionEditing", "OCRSegmentCategorizationEditing", ]: - wordCount = 0 + word_count = 0 for r in result: if r["from_name"] == "ocr_transcribed_json": - wordCount += calculateWordCount(ann_obj.result) - return {"wordCount": wordCount} + word_count += calculateWordCount(ann_obj.result) + return {"word_count": word_count, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score + } def calculateWordCount(annotation_result): diff --git a/backend/workspaces/tasks.py b/backend/workspaces/tasks.py index 90aae0516..cda293bf4 100644 --- a/backend/workspaces/tasks.py +++ b/backend/workspaces/tasks.py @@ -32,6 +32,7 @@ ocr_word_count, ) from tasks.views import SentenceOperationViewSet +from tasks.utils import calculateWordCount, calculateAudioDuration, calculateSentenceCount def get_all_annotation_reports( @@ -169,7 +170,13 @@ def get_all_annotation_reports( if project_type in ["ConversationTranslationEditing", "ConversationTranslation"] else False ) - total_audio_duration_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] total_raw_audio_duration_list = [] total_word_count_list = [] only_tasks = False @@ -191,36 +198,120 @@ def get_all_annotation_reports( else: total_word_count_list.append(ocr_word_count(anno.result)) elif ( - project_type in get_audio_project_types() or project_type == "AllAudioProjects" + project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: meta_stats = anno.meta_stats - if "total_audio_duration" in meta_stats: - total_audio_duration_list.append(meta_stats["total_audio_duration"]) + if not meta_stats: + meta_stats = [] + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == 
"AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) else: - try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) - except: - pass - if "raw_audio_duration" in meta_stats: - total_raw_audio_duration_list.append(meta_stats["raw_audio_duration"]) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + try: + total_raw_audio_duration_list.append( + anno.task.data["audio_duration"] + ) + except: + pass + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) else: try: - total_raw_audio_duration_list.append( - anno.task.data["audio_duration"] + total_segment_duration.append( + get_audio_transcription_duration(anno.result) ) except: pass else: only_tasks = True - - total_word_count = sum(total_word_count_list) - total_audio_duration = convert_seconds_to_hours(sum(total_audio_duration_list)) - total_raw_audio_duration = convert_seconds_to_hours( - sum(total_raw_audio_duration_list) - ) + total_raw_audio_duration, total_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + total_raw_audio_duration = convert_seconds_to_hours( + sum(total_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours( + sum(total_segment_duration) + ) + else: + total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 if tasks_and_rejection_count_map_ar: for task, rc in tasks_and_rejection_count_map_ar.items(): @@ -232,8 +323,14 @@ def get_all_annotation_reports( "Participation 
Type": participation_type, "Role": role, "Type of Work": "Annotator", - "Total Segments Duration": total_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": total_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Word Count": total_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -263,17 +360,44 @@ def get_all_annotation_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result @@ -405,9 +529,15 @@ def get_all_review_reports( if project_type in ["ConversationTranslationEditing", "ConversationTranslation"] else False ) - total_audio_duration_list = [] total_raw_audio_duration_list = [] total_word_count_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] only_tasks = False if is_translation_project: for anno in submitted_tasks: @@ -431,31 +561,114 @@ def get_all_review_reports( ): for anno in submitted_tasks: meta_stats = anno.meta_stats - if "total_audio_duration" in meta_stats: - total_audio_duration_list.append(meta_stats["total_audio_duration"]) + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + 
acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) else: - try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) - except: - pass - if "raw_audio_duration" in meta_stats: - total_raw_audio_duration_list.append(meta_stats["raw_audio_duration"]) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) else: try: - total_raw_audio_duration_list.append( - anno.task.data["audio_duration"] + total_segment_duration.append( + get_audio_transcription_duration(anno.result) ) except: pass + try: + total_raw_audio_duration_list.append( + 
anno.task.data["audio_duration"] + ) + except: + pass else: only_tasks = True - total_word_count = sum(total_word_count_list) - total_audio_duration = convert_seconds_to_hours(sum(total_audio_duration_list)) - total_raw_audio_duration = convert_seconds_to_hours( - sum(total_raw_audio_duration_list) - ) + total_raw_audio_duration, total_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + total_raw_audio_duration = convert_seconds_to_hours( + sum(total_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours( + sum(total_segment_duration) + ) + else: + total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 if tasks_and_rejection_count_map_ar: for task, rc in tasks_and_rejection_count_map_ar.items(): @@ -472,8 +685,14 @@ def get_all_review_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Review", - "Total Segments Duration": total_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": total_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Word Count": total_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -500,17 +719,44 @@ def get_all_review_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised 
Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result @@ -570,7 +816,13 @@ def get_all_supercheck_reports( else False ) validated_word_count_list = [] - validated_audio_duration_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] validated_raw_audio_duration_list = [] only_tasks = False if is_translation_project: @@ -595,36 +847,115 @@ def get_all_supercheck_reports( ): for anno in submitted_tasks: meta_stats = anno.meta_stats - if "total_audio_duration" in meta_stats: - validated_audio_duration_list.append(meta_stats["total_audio_duration"]) + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append(calculateAudioDuration(r)) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + 
verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + else: + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) else: try: - validated_audio_duration_list.append( + total_segment_duration.append( get_audio_transcription_duration(anno.result) ) except: pass - if "raw_audio_duration" in meta_stats: + try: validated_raw_audio_duration_list.append( - meta_stats["raw_audio_duration"] + anno.task.data["audio_duration"] ) - else: - try: - validated_raw_audio_duration_list.append( - anno.task.data["audio_duration"] - ) - except: - pass + except: + pass else: only_tasks = True - validated_word_count = sum(validated_word_count_list) - validated_audio_duration = convert_seconds_to_hours( - sum(validated_audio_duration_list) - ) - validated_raw_audio_duration = convert_seconds_to_hours( - sum(validated_raw_audio_duration_list) - ) + validated_raw_audio_duration, validated_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + validated_raw_audio_duration = convert_seconds_to_hours( + sum(validated_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours( + sum(total_segment_duration) + ) + else: + validated_word_count = sum(validated_word_count_list) cumulative_rejection_score_rs = 0 if tasks_and_rejection_count_map_rs: for task, rc in tasks_and_rejection_count_map_rs.items(): @@ -636,8 +967,14 @@ def get_all_supercheck_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Supercheck", - "Total Segments Duration": validated_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": validated_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Word Count": validated_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -649,19 +986,44 @@ def get_all_supercheck_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del 
result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] - - return result + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] @shared_task(queue="reports") From 568ac0ebad80ea3d0673780581eec93642d8d2c4 Mon Sep 17 00:00:00 2001 From: Kunal Tiwary Date: Thu, 19 Sep 2024 07:10:05 +0000 Subject: [PATCH 4/5] black filtering --- backend/organizations/tasks.py | 126 ++++++++++++------ .../migrations/0049_annotation_meta_stats.py | 14 +- backend/tasks/utils.py | 91 +++++++------ backend/workspaces/tasks.py | 124 +++++++++++------ 4 files changed, 229 insertions(+), 126 deletions(-) diff --git a/backend/organizations/tasks.py b/backend/organizations/tasks.py index d4440e0cd..1a8754cf9 100644 --- a/backend/organizations/tasks.py +++ b/backend/organizations/tasks.py @@ -35,7 +35,11 @@ un_pack_annotation_tasks, ) from django.db.models import Q -from tasks.utils import calculateWordCount, calculateAudioDuration, calculateSentenceCount +from tasks.utils import ( + calculateWordCount, + calculateAudioDuration, + calculateSentenceCount, +) def get_all_annotation_reports( @@ -209,13 +213,19 @@ def get_all_annotation_reports( meta_stats = [] if project_type == "AllAudioProjects": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -246,13 +256,19 @@ def get_all_annotation_reports( transcribed_word_count.append(calculateAudioDuration(r)) elif project_type == 
"AcousticNormalisedTranscriptionEditing": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -283,9 +299,7 @@ def get_all_annotation_reports( if r["from_name"] == "transcribed_json": transcribed_word_count.append(calculateAudioDuration(r)) try: - total_raw_audio_duration_list.append( - anno.task.data["audio_duration"] - ) + total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) except: pass if "total_segment_duration" in meta_stats: @@ -301,18 +315,20 @@ def get_all_annotation_reports( only_tasks = True total_raw_audio_duration, total_word_count = 0, 0 if project_type in get_audio_project_types() or project_type == "AllAudioProjects": - acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) - acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) total_raw_audio_duration = convert_seconds_to_hours( sum(total_raw_audio_duration_list) ) - total_segment_duration = convert_seconds_to_hours( - sum(total_segment_duration) - ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) else: total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 @@ -559,19 +575,25 @@ def get_all_review_reports( else: total_word_count_list.append(ocr_word_count(anno.result)) elif ( - project_type in get_audio_project_types() or project_type == "AllAudioProjects" + project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: meta_stats = anno.meta_stats if project_type == "AllAudioProjects": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ 
-602,13 +624,19 @@ def get_all_review_reports( transcribed_word_count.append(calculateAudioDuration(r)) elif project_type == "AcousticNormalisedTranscriptionEditing": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -648,9 +676,7 @@ def get_all_review_reports( except: pass try: - total_raw_audio_duration_list.append( - anno.task.data["audio_duration"] - ) + total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) except: pass else: @@ -658,18 +684,20 @@ def get_all_review_reports( total_raw_audio_duration, total_word_count = 0, 0 if project_type in get_audio_project_types() or project_type == "AllAudioProjects": - acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) - acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) total_raw_audio_duration = convert_seconds_to_hours( sum(total_raw_audio_duration_list) ) - total_segment_duration = convert_seconds_to_hours( - sum(total_segment_duration) - ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) else: total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 @@ -845,19 +873,25 @@ def get_all_supercheck_reports( else: validated_word_count_list.append(ocr_word_count(anno.result)) elif ( - project_type in get_audio_project_types() or project_type == "AllAudioProjects" + project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: meta_stats = anno.meta_stats if project_type == "AllAudioProjects": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -888,13 
+922,19 @@ def get_all_supercheck_reports( transcribed_word_count.append(calculateAudioDuration(r)) elif project_type == "AcousticNormalisedTranscriptionEditing": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -944,18 +984,20 @@ def get_all_supercheck_reports( validated_raw_audio_duration, validated_word_count = 0, 0 if project_type in get_audio_project_types() or project_type == "AllAudioProjects": - acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) - acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) validated_raw_audio_duration = convert_seconds_to_hours( sum(validated_raw_audio_duration_list) ) - total_segment_duration = convert_seconds_to_hours( - sum(total_segment_duration) - ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) else: validated_word_count = sum(validated_word_count_list) cumulative_rejection_score_rs = 0 diff --git a/backend/tasks/migrations/0049_annotation_meta_stats.py b/backend/tasks/migrations/0049_annotation_meta_stats.py index bc235ca03..a9413e0f4 100644 --- a/backend/tasks/migrations/0049_annotation_meta_stats.py +++ b/backend/tasks/migrations/0049_annotation_meta_stats.py @@ -4,15 +4,19 @@ class Migration(migrations.Migration): - dependencies = [ - ('tasks', '0048_alter_annotation_unique_together'), + ("tasks", "0048_alter_annotation_unique_together"), ] operations = [ migrations.AddField( - model_name='annotation', - name='meta_stats', - field=models.JSONField(blank=True, help_text='Meta statistics for the annotation result', null=True, verbose_name='meta_stats'), + model_name="annotation", + name="meta_stats", + field=models.JSONField( + blank=True, + help_text="Meta statistics for the annotation result", + null=True, + verbose_name="meta_stats", + ), ), ] diff --git a/backend/tasks/utils.py b/backend/tasks/utils.py index 71c8209c2..da19d59d2 100644 --- a/backend/tasks/utils.py +++ b/backend/tasks/utils.py @@ -10,7 +10,17 @@ get_not_null_audio_transcription_duration, calculate_word_error_rate_between_two_audio_transcription_annotation, ) -from tasks.models import Annotation, REVIEWER_ANNOTATION, ANNOTATOR_ANNOTATION, SUPER_CHECKER_ANNOTATION, ACCEPTED, ACCEPTED_WITH_MINOR_CHANGES, ACCEPTED_WITH_MAJOR_CHANGES, VALIDATED, VALIDATED_WITH_CHANGES +from 
tasks.models import ( + Annotation, + REVIEWER_ANNOTATION, + ANNOTATOR_ANNOTATION, + SUPER_CHECKER_ANNOTATION, + ACCEPTED, + ACCEPTED_WITH_MINOR_CHANGES, + ACCEPTED_WITH_MAJOR_CHANGES, + VALIDATED, + VALIDATED_WITH_CHANGES, +) Queued_Task_name = { @@ -74,6 +84,7 @@ def query_flower(filters=None): def compute_meta_stats_for_annotation(ann_obj, project_type): from tasks.views import SentenceOperationViewSet + task_obj = ann_obj.task task_data = task_obj.data ced_project_type_choices = ["ContextualTranslationEditing"] @@ -93,10 +104,10 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): ACCEPTED_WITH_MAJOR_CHANGES, ]: rev_ann = a - elif ( - a.annotation_type == SUPER_CHECKER_ANNOTATION - and a.annotation_status in [VALIDATED, VALIDATED_WITH_CHANGES] - ): + elif a.annotation_type == SUPER_CHECKER_ANNOTATION and a.annotation_status in [ + VALIDATED, + VALIDATED_WITH_CHANGES, + ]: sup_ann = a elif a.annotation_type == ANNOTATOR_ANNOTATION: ann_ann = a @@ -178,7 +189,7 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): "as_wer_score": as_wer_score, "rs_wer_score": rs_wer_score, "ar_bleu_score": ar_bleu_score, - "rs_bleu_score": rs_bleu_score + "rs_bleu_score": rs_bleu_score, } elif project_type in ["AudioTranscription", "AudioTranscriptionEditing"]: transcribed_word_count, transcribed_duration = 0, 0 @@ -188,16 +199,17 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): transcribed_duration += calculateAudioDuration(ann_obj.result) segment_duration = get_audio_transcription_duration(result) not_null_segment_duration = get_not_null_audio_transcription_duration(result) - return {"audio_word_count": transcribed_word_count, - "transcribed_duration": transcribed_duration, - "total_segment_duration": segment_duration, - "not_null_segment_duration": not_null_segment_duration, - "ar_wer_score": ar_wer_score, - "as_wer_score": as_wer_score, - "rs_wer_score": rs_wer_score, - "ar_bleu_score": ar_bleu_score, - "rs_bleu_score": rs_bleu_score - } + return { + "audio_word_count": transcribed_word_count, + "transcribed_duration": transcribed_duration, + "total_segment_duration": segment_duration, + "not_null_segment_duration": not_null_segment_duration, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score, + } elif project_type in [ "ContextualSentenceVerification", "ContextualSentenceVerificationAndDomainClassification", @@ -208,13 +220,14 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): for r in result: if r["type"] == "textarea": word_count += calculateWordCount(ann_obj.result) - return {"word_count": word_count, - "ar_wer_score": ar_wer_score, - "as_wer_score": as_wer_score, - "rs_wer_score": rs_wer_score, - "ar_bleu_score": ar_bleu_score, - "rs_bleu_score": rs_bleu_score - } + return { + "word_count": word_count, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score, + } elif project_type in [ "ConversationTranslation", "ConversationTranslationEditing", @@ -228,14 +241,15 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): ann_obj.result["value"]["text"][0] ) - return {"word_count": word_count, - "sentence_count": sentence_count, - "ar_wer_score": ar_wer_score, - "as_wer_score": as_wer_score, - "rs_wer_score": rs_wer_score, - "ar_bleu_score": ar_bleu_score, - "rs_bleu_score": rs_bleu_score - } + return { + "word_count": 
word_count, + "sentence_count": sentence_count, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score, + } elif project_type in [ "OCRTranscription", "OCRTranscriptionEditing", @@ -245,13 +259,14 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): for r in result: if r["from_name"] == "ocr_transcribed_json": word_count += calculateWordCount(ann_obj.result) - return {"word_count": word_count, - "ar_wer_score": ar_wer_score, - "as_wer_score": as_wer_score, - "rs_wer_score": rs_wer_score, - "ar_bleu_score": ar_bleu_score, - "rs_bleu_score": rs_bleu_score - } + return { + "word_count": word_count, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score, + } def calculateWordCount(annotation_result): diff --git a/backend/workspaces/tasks.py b/backend/workspaces/tasks.py index cda293bf4..440d8392e 100644 --- a/backend/workspaces/tasks.py +++ b/backend/workspaces/tasks.py @@ -32,7 +32,11 @@ ocr_word_count, ) from tasks.views import SentenceOperationViewSet -from tasks.utils import calculateWordCount, calculateAudioDuration, calculateSentenceCount +from tasks.utils import ( + calculateWordCount, + calculateAudioDuration, + calculateSentenceCount, +) def get_all_annotation_reports( @@ -198,7 +202,7 @@ def get_all_annotation_reports( else: total_word_count_list.append(ocr_word_count(anno.result)) elif ( - project_type in get_audio_project_types() or project_type == "AllAudioProjects" + project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: meta_stats = anno.meta_stats @@ -206,13 +210,19 @@ def get_all_annotation_reports( meta_stats = [] if project_type == "AllAudioProjects": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -243,13 +253,19 @@ def get_all_annotation_reports( transcribed_word_count.append(calculateAudioDuration(r)) elif project_type == "AcousticNormalisedTranscriptionEditing": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == 
"acoustic_normalised_transcribed_json": @@ -280,9 +296,7 @@ def get_all_annotation_reports( if r["from_name"] == "transcribed_json": transcribed_word_count.append(calculateAudioDuration(r)) try: - total_raw_audio_duration_list.append( - anno.task.data["audio_duration"] - ) + total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) except: pass if "total_segment_duration" in meta_stats: @@ -298,18 +312,20 @@ def get_all_annotation_reports( only_tasks = True total_raw_audio_duration, total_word_count = 0, 0 if project_type in get_audio_project_types() or project_type == "AllAudioProjects": - acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) - acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) total_raw_audio_duration = convert_seconds_to_hours( sum(total_raw_audio_duration_list) ) - total_segment_duration = convert_seconds_to_hours( - sum(total_segment_duration) - ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) else: total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 @@ -563,13 +579,19 @@ def get_all_review_reports( meta_stats = anno.meta_stats if project_type == "AllAudioProjects": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -600,13 +622,19 @@ def get_all_review_reports( transcribed_word_count.append(calculateAudioDuration(r)) elif project_type == "AcousticNormalisedTranscriptionEditing": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -646,27 +674,27 @@ def get_all_review_reports( except: pass try: - 
total_raw_audio_duration_list.append( - anno.task.data["audio_duration"] - ) + total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) except: pass else: only_tasks = True total_raw_audio_duration, total_word_count = 0, 0 if project_type in get_audio_project_types() or project_type == "AllAudioProjects": - acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) - acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) total_raw_audio_duration = convert_seconds_to_hours( sum(total_raw_audio_duration_list) ) - total_segment_duration = convert_seconds_to_hours( - sum(total_segment_duration) - ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) else: total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 @@ -849,13 +877,19 @@ def get_all_supercheck_reports( meta_stats = anno.meta_stats if project_type == "AllAudioProjects": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -886,13 +920,19 @@ def get_all_supercheck_reports( transcribed_word_count.append(calculateAudioDuration(r)) elif project_type == "AcousticNormalisedTranscriptionEditing": if "acoustic_normalised_duration" in meta_stats: - acoustic_normalised_duration.append(meta_stats["acoustic_normalised_duration"]) + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_duration.append(calculateAudioDuration(r)) + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) if "acoustic_normalised_word_count" in meta_stats: - acoustic_normalised_word_count.append(meta_stats["acoustic_normalised_word_count"]) + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) else: for r in anno.result: if r["from_name"] == "acoustic_normalised_transcribed_json": @@ -942,18 +982,20 @@ def get_all_supercheck_reports( validated_raw_audio_duration, validated_word_count = 0, 0 if project_type in get_audio_project_types() or project_type == "AllAudioProjects": - acoustic_normalised_duration = convert_seconds_to_hours(sum(acoustic_normalised_duration)) + acoustic_normalised_duration = convert_seconds_to_hours( + 
sum(acoustic_normalised_duration) + ) verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) - acoustic_normalised_word_count = convert_seconds_to_hours(sum(acoustic_normalised_word_count)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) validated_raw_audio_duration = convert_seconds_to_hours( sum(validated_raw_audio_duration_list) ) - total_segment_duration = convert_seconds_to_hours( - sum(total_segment_duration) - ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) else: validated_word_count = sum(validated_word_count_list) cumulative_rejection_score_rs = 0 From e48fc9191afd3991771d65f199e0eb5ffd93edf0 Mon Sep 17 00:00:00 2001 From: Kunal Tiwary Date: Thu, 19 Sep 2024 08:58:31 +0000 Subject: [PATCH 5/5] minor fix --- backend/tasks/utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/backend/tasks/utils.py b/backend/tasks/utils.py index da19d59d2..0e96f8a07 100644 --- a/backend/tasks/utils.py +++ b/backend/tasks/utils.py @@ -171,11 +171,11 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): ) = (0, 0, 0, 0) for r in result: if r["from_name"] == "acoustic_normalised_transcribed_json": - acoustic_normalised_word_count += calculateWordCount(ann_obj.result) - acoustic_normalised_duration += calculateAudioDuration(ann_obj.result) + acoustic_normalised_word_count += calculateWordCount(r) + acoustic_normalised_duration += calculateAudioDuration(r) elif r["from_name"] == "verbatim_transcribed_json": - verbatim_word_count += calculateWordCount(ann_obj.result) - verbatim_duration += calculateAudioDuration(ann_obj.result) + verbatim_word_count += calculateWordCount(r) + verbatim_duration += calculateAudioDuration(r) segment_duration = get_audio_transcription_duration(result) not_null_segment_duration = get_not_null_audio_transcription_duration(result) return { @@ -195,8 +195,8 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): transcribed_word_count, transcribed_duration = 0, 0 for r in result: if r["from_name"] == "transcribed_json": - transcribed_word_count += calculateWordCount(ann_obj.result) - transcribed_duration += calculateAudioDuration(ann_obj.result) + transcribed_word_count += calculateWordCount(r) + transcribed_duration += calculateAudioDuration(r) segment_duration = get_audio_transcription_duration(result) not_null_segment_duration = get_not_null_audio_transcription_duration(result) return { @@ -219,7 +219,7 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): word_count = 0 for r in result: if r["type"] == "textarea": - word_count += calculateWordCount(ann_obj.result) + word_count += calculateWordCount(r) return { "word_count": word_count, "ar_wer_score": ar_wer_score, @@ -236,7 +236,7 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): word_count, sentence_count = 0, 0 for r in result: if r["type"] == "textarea": - word_count += calculateWordCount(ann_obj.result) + word_count += calculateWordCount(r) sentence_count += calculateSentenceCount( ann_obj.result["value"]["text"][0] ) @@ -258,7 +258,7 @@ def compute_meta_stats_for_annotation(ann_obj, project_type): word_count = 0 for r in result: if r["from_name"] == "ocr_transcribed_json": - word_count += 
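# Illustrative sketch, not part of the applied diff: the hunks in this last patch
# swap ann_obj.result (the whole result list) for the loop variable r.  The
# calculate* helpers index a single entry ("value", "start", "end", "text"), so
# handing them the full list makes the lookup raise and the bare except fall back
# to zero.  A hypothetical two-segment result shows the difference:
demo_result = [
    {"from_name": "transcribed_json", "type": "textarea",
     "value": {"start": 0.0, "end": 2.0, "text": ["hello there"]}},
    {"from_name": "transcribed_json", "type": "textarea",
     "value": {"start": 2.0, "end": 5.0, "text": ["how are you"]}},
]
assert calculateAudioDuration(demo_result) == 0                    # whole list: lookup fails, returns 0
assert sum(calculateAudioDuration(r) for r in demo_result) == 5.0  # per entry: 2.0 + 3.0 seconds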
calculateWordCount(ann_obj.result) + word_count += calculateWordCount(r) return { "word_count": word_count, "ar_wer_score": ar_wer_score,