diff --git a/backend/functions/tasks.py b/backend/functions/tasks.py index 614641df1..c0138bd2a 100644 --- a/backend/functions/tasks.py +++ b/backend/functions/tasks.py @@ -1541,15 +1541,24 @@ def update_meta_stats( ann_obj.result ) elif project_type in get_audio_project_types(): - result_meta_stats[ann_obj.annotation_status]["Raw Audio Duration"] += task_data[ - "audio_duration" - ] - result_meta_stats[ann_obj.annotation_status][ - "Segment Duration" - ] += get_audio_transcription_duration(ann_obj.result) - result_meta_stats[ann_obj.annotation_status][ - "Not Null Segment Duration" - ] += get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id) + meta_stats = ann_obj.meta_stats + if meta_stats: + result_meta_stats[ann_obj.annotation_status][ + "Segment Duration" + ] = meta_stats["total_segment_duration"] + result_meta_stats[ann_obj.annotation_status][ + "Not Null Segment Duration" + ] = meta_stats["not_null_segment_duration"] + else: + result_meta_stats[ann_obj.annotation_status][ + "Raw Audio Duration" + ] += task_data["audio_duration"] + result_meta_stats[ann_obj.annotation_status][ + "Segment Duration" + ] += get_audio_transcription_duration(ann_obj.result) + result_meta_stats[ann_obj.annotation_status][ + "Not Null Segment Duration" + ] += get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id) def calculate_ced_between_two_annotations(annotation1, annotation2): diff --git a/backend/organizations/tasks.py b/backend/organizations/tasks.py index 5fef46978..1a8754cf9 100644 --- a/backend/organizations/tasks.py +++ b/backend/organizations/tasks.py @@ -35,6 +35,11 @@ un_pack_annotation_tasks, ) from django.db.models import Q +from tasks.utils import ( + calculateWordCount, + calculateAudioDuration, + calculateSentenceCount, +) def get_all_annotation_reports( @@ -119,27 +124,41 @@ def get_all_annotation_reports( if a.annotation_type == REVIEWER_ANNOTATION: number_of_tasks_that_has_review_annotations += 1 if ann_ann and rev_ann and not ar_done: - try: - ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( - rev_ann.result, ann_ann.result, project_type - ) + meta_stats = rev_ann.meta_stats + if "word_error_rate" in meta_stats: + ar_wer_score += meta_stats["word_error_rate"] number_of_tasks_contributed_for_ar_wer += 1 ar_done = True - except Exception as e: - pass - try: - s1 = SentenceOperationViewSet() - sampleRequest = { - "annotation_result1": rev_ann.result, - "annotation_result2": ann_ann.result, - } - ar_bleu_score += float( - s1.calculate_bleu_score(sampleRequest).data["bleu_score"] - ) - number_of_tasks_contributed_for_ar_bleu += 1 - except Exception as e: - pass + else: + try: + ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + rev_ann.result, ann_ann.result, project_type + ) + number_of_tasks_contributed_for_ar_wer += 1 + ar_done = True + except Exception as e: + pass + if "bleu_score" in meta_stats: + ar_bleu_score += meta_stats["bleu_score"] + else: + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": rev_ann.result, + "annotation_result2": ann_ann.result, + } + ar_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["bleu_score"] + ) + number_of_tasks_contributed_for_ar_bleu += 1 + except Exception as e: + pass if ann_ann and sup_ann and not as_done: + meta_stats = sup_ann.meta_stats + if "word_error_rate" in meta_stats: + as_wer_score += meta_stats["word_error_rate"] + number_of_tasks_contributed_for_as_wer += 1 + as_done = True try: 
as_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( sup_ann.result, ann_ann.result, project_type @@ -158,38 +177,160 @@ def get_all_annotation_reports( if project_type in ["ConversationTranslationEditing", "ConversationTranslation"] else False ) - total_audio_duration_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] total_raw_audio_duration_list = [] total_word_count_list = [] only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - total_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + try: + total_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - total_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + total_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: + meta_stats = anno.meta_stats + if not meta_stats: + meta_stats = [] + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( 
+ meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + else: + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) except: pass + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) + else: + try: + total_segment_duration.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass else: only_tasks = True - - total_word_count = sum(total_word_count_list) - total_audio_duration = convert_seconds_to_hours(sum(total_audio_duration_list)) - total_raw_audio_duration = convert_seconds_to_hours( - sum(total_raw_audio_duration_list) - ) + total_raw_audio_duration, total_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + total_raw_audio_duration = convert_seconds_to_hours( + sum(total_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) + else: + total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 if tasks_and_rejection_count_map_ar: for task, rc in tasks_and_rejection_count_map_ar.items(): @@ -200,10 +341,16 @@ def get_all_annotation_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Annotator", - "Total Segments Duration": total_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": total_raw_audio_duration, - "Word Count": total_word_count, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Submitted Tasks": submitted_tasks_count, + "Word 
Count": total_word_count, "Language": user_lang, "Average Word Error Rate Annotator Vs Reviewer": ar_wer_score / number_of_tasks_contributed_for_ar_wer @@ -231,17 +378,44 @@ def get_all_annotation_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result @@ -335,26 +509,35 @@ def get_all_review_reports( if a.annotation_type == SUPER_CHECKER_ANNOTATION: number_of_tasks_that_has_sup_annotations += 1 if rev_ann and sup_ann and not rs_done: - try: - rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( - sup_ann.result, rev_ann.result, project_type - ) + meta_stats = sup_ann.meta_stats + if "word_error_rate" in meta_stats: + rs_wer_score += meta_stats["word_error_rate"] number_of_tasks_contributed_for_rs_wer += 1 rs_done = True - except Exception as e: - pass - try: - s1 = SentenceOperationViewSet() - sampleRequest = { - "annotation_result1": sup_ann.result, - "annotation_result2": rev_ann.result, - } - rs_bleu_score += float( - s1.calculate_bleu_score(sampleRequest).data["bleu_score"] - ) - number_of_tasks_contributed_for_rs_bleu += 1 - except Exception as e: - pass + else: + try: + rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + sup_ann.result, rev_ann.result, project_type + ) + number_of_tasks_contributed_for_rs_wer += 1 + rs_done = True + except Exception as e: + pass + if "bleu_score" in meta_stats: + rs_bleu_score += meta_stats["bleu_score"] + else: + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": sup_ann.result, + "annotation_result2": rev_ann.result, + } + rs_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["bleu_score"] + ) + number_of_tasks_contributed_for_rs_bleu += 1 + except Exception as e: + pass submitted_tasks_count = submitted_tasks.count() 
project_type_lower = project_type.lower() @@ -364,38 +547,159 @@ def get_all_review_reports( if project_type in ["ConversationTranslationEditing", "ConversationTranslation"] else False ) - total_audio_duration_list = [] total_raw_audio_duration_list = [] total_word_count_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - total_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + try: + total_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - total_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + total_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: + meta_stats = anno.meta_stats + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == 
"acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + else: + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) + else: + try: + total_segment_duration.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) except: pass else: only_tasks = True - total_word_count = sum(total_word_count_list) - total_audio_duration = convert_seconds_to_hours(sum(total_audio_duration_list)) - total_raw_audio_duration = convert_seconds_to_hours( - sum(total_raw_audio_duration_list) - ) + total_raw_audio_duration, total_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + total_raw_audio_duration = convert_seconds_to_hours( + sum(total_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) + else: + total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 if tasks_and_rejection_count_map_ar: for task, rc in tasks_and_rejection_count_map_ar.items(): @@ -412,8 +716,14 @@ def get_all_review_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Review", - "Total Segments Duration": total_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": total_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Word Count": total_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -440,17 +750,44 @@ def get_all_review_reports( if project_type in get_audio_project_types() or 
project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result @@ -509,26 +846,134 @@ def get_all_supercheck_reports( else False ) validated_word_count_list = [] - validated_audio_duration_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] validated_raw_audio_duration_list = [] only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - validated_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + validated_word_count_list.append(meta_stats["word_count"]) + else: + try: + validated_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - validated_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + validated_word_count_list.append(meta_stats["word_count"]) + else: + validated_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: + meta_stats = anno.meta_stats + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == 
"acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + else: + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) + else: + try: + total_segment_duration.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass try: - validated_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) validated_raw_audio_duration_list.append( anno.task.data["audio_duration"] ) @@ -537,13 +982,24 @@ def get_all_supercheck_reports( else: only_tasks = True - validated_word_count = sum(validated_word_count_list) - validated_audio_duration = convert_seconds_to_hours( - sum(validated_audio_duration_list) - ) - validated_raw_audio_duration = convert_seconds_to_hours( - sum(validated_raw_audio_duration_list) - ) + validated_raw_audio_duration, validated_word_count = 0, 0 + if project_type in 
get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + validated_raw_audio_duration = convert_seconds_to_hours( + sum(validated_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) + else: + validated_word_count = sum(validated_word_count_list) cumulative_rejection_score_rs = 0 if tasks_and_rejection_count_map_rs: for task, rc in tasks_and_rejection_count_map_rs.items(): @@ -555,8 +1011,14 @@ def get_all_supercheck_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Supercheck", - "Total Segments Duration": validated_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": validated_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Word Count": validated_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -568,17 +1030,44 @@ def get_all_supercheck_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result diff --git a/backend/projects/utils.py 
b/backend/projects/utils.py index 931271435..634066e15 100644 --- a/backend/projects/utils.py +++ b/backend/projects/utils.py @@ -173,7 +173,7 @@ def get_audio_transcription_duration(annotation_result): return audio_duration -def get_not_null_audio_transcription_duration(annotation_result, ann_id): +def get_not_null_audio_transcription_duration(annotation_result, ann_id=None): audio_duration = 0 memory = create_memory(annotation_result) for key, indexes in memory.items(): diff --git a/backend/tasks/migrations/0049_annotation_meta_stats.py b/backend/tasks/migrations/0049_annotation_meta_stats.py new file mode 100644 index 000000000..a9413e0f4 --- /dev/null +++ b/backend/tasks/migrations/0049_annotation_meta_stats.py @@ -0,0 +1,22 @@ +# Generated by Django 3.2.14 on 2024-09-19 03:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("tasks", "0048_alter_annotation_unique_together"), + ] + + operations = [ + migrations.AddField( + model_name="annotation", + name="meta_stats", + field=models.JSONField( + blank=True, + help_text="Meta statistics for the annotation result", + null=True, + verbose_name="meta_stats", + ), + ), + ] diff --git a/backend/tasks/models.py b/backend/tasks/models.py index 7fac09c7a..e636a01cb 100644 --- a/backend/tasks/models.py +++ b/backend/tasks/models.py @@ -261,6 +261,13 @@ class Annotation(models.Model): help_text=("Time when the annotation was first labeled/accepted/validated"), ) + meta_stats = models.JSONField( + blank=True, + null=True, + verbose_name="meta_stats", + help_text="Meta statistics for the annotation result", + ) + def __str__(self): return str(self.id) diff --git a/backend/tasks/utils.py b/backend/tasks/utils.py index 442901c8b..0e96f8a07 100644 --- a/backend/tasks/utils.py +++ b/backend/tasks/utils.py @@ -1,7 +1,27 @@ import os +import re from requests import RequestException import requests from dotenv import load_dotenv +from projects.utils import ( + no_of_words, + get_audio_project_types, + get_audio_transcription_duration, + get_not_null_audio_transcription_duration, + calculate_word_error_rate_between_two_audio_transcription_annotation, +) +from tasks.models import ( + Annotation, + REVIEWER_ANNOTATION, + ANNOTATOR_ANNOTATION, + SUPER_CHECKER_ANNOTATION, + ACCEPTED, + ACCEPTED_WITH_MINOR_CHANGES, + ACCEPTED_WITH_MAJOR_CHANGES, + VALIDATED, + VALIDATED_WITH_CHANGES, +) + Queued_Task_name = { "dataset.tasks.deduplicate_dataset_instance_items": "Deduplicate Dataset Instance Items", @@ -60,3 +80,214 @@ def query_flower(filters=None): return {"error": "Failed to retrieve tasks from Flower"} except RequestException as e: return {"error": f" failed to connect to flower API, {str(e)}"} + + +def compute_meta_stats_for_annotation(ann_obj, project_type): + from tasks.views import SentenceOperationViewSet + + task_obj = ann_obj.task + task_data = task_obj.data + ced_project_type_choices = ["ContextualTranslationEditing"] + result_meta_stats = {} + result = ann_obj.result + + # calculating wer and bleu scores + all_annotations = Annotation.objects.filter(task_id=task_obj.id) + ar_wer_score, as_wer_score, rs_wer_score = 0, 0, 0 + ar_bleu_score, rs_bleu_score = 0, 0 + ar_done, as_done, rs_done = False, False, False + ann_ann, rev_ann, sup_ann = "", "", "" + for a in all_annotations: + if a.annotation_type == REVIEWER_ANNOTATION and a.annotation_status in [ + ACCEPTED, + ACCEPTED_WITH_MINOR_CHANGES, + ACCEPTED_WITH_MAJOR_CHANGES, + ]: + rev_ann = a + elif a.annotation_type == 
SUPER_CHECKER_ANNOTATION and a.annotation_status in [ + VALIDATED, + VALIDATED_WITH_CHANGES, + ]: + sup_ann = a + elif a.annotation_type == ANNOTATOR_ANNOTATION: + ann_ann = a + if ann_ann and rev_ann and not ar_done: + try: + ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + rev_ann.result, ann_ann.result, project_type + ) + ar_done = True + except Exception as e: + pass + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": rev_ann.result, + "annotation_result2": ann_ann.result, + } + ar_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["ar_bleu_score"] + ) + except Exception as e: + pass + if rev_ann and sup_ann and not rs_done: + try: + rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + sup_ann.result, rev_ann.result, project_type + ) + rs_done = True + except Exception as e: + pass + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": sup_ann.result, + "annotation_result2": rev_ann.result, + } + rs_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["rs_bleu_score"] + ) + except Exception as e: + pass + if ann_ann and sup_ann and not as_done: + meta_stats = sup_ann.meta_stats + if "as_wer_score" in meta_stats: + as_wer_score += meta_stats["as_wer_score"] + as_done = True + try: + as_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + sup_ann.result, ann_ann.result, project_type + ) + as_done = True + except Exception as e: + pass + + if project_type == "AcousticNormalisedTranscriptionEditing": + ( + acoustic_normalised_word_count, + verbatim_word_count, + acoustic_normalised_duration, + verbatim_duration, + ) = (0, 0, 0, 0) + for r in result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count += calculateWordCount(r) + acoustic_normalised_duration += calculateAudioDuration(r) + elif r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count += calculateWordCount(r) + verbatim_duration += calculateAudioDuration(r) + segment_duration = get_audio_transcription_duration(result) + not_null_segment_duration = get_not_null_audio_transcription_duration(result) + return { + "acoustic_normalised_word_count": acoustic_normalised_word_count, + "verbatim_word_count": verbatim_word_count, + "acoustic_normalised_duration": acoustic_normalised_duration, + "verbatim_duration": verbatim_duration, + "total_segment_duration": segment_duration, + "not_null_segment_duration": not_null_segment_duration, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": rs_bleu_score, + } + elif project_type in ["AudioTranscription", "AudioTranscriptionEditing"]: + transcribed_word_count, transcribed_duration = 0, 0 + for r in result: + if r["from_name"] == "transcribed_json": + transcribed_word_count += calculateWordCount(r) + transcribed_duration += calculateAudioDuration(r) + segment_duration = get_audio_transcription_duration(result) + not_null_segment_duration = get_not_null_audio_transcription_duration(result) + return { + "audio_word_count": transcribed_word_count, + "transcribed_duration": transcribed_duration, + "total_segment_duration": segment_duration, + "not_null_segment_duration": not_null_segment_duration, + "ar_wer_score": ar_wer_score, + "as_wer_score": as_wer_score, + "rs_wer_score": rs_wer_score, + "ar_bleu_score": ar_bleu_score, + "rs_bleu_score": 
rs_bleu_score,
+        }
+    elif project_type in [
+        "ContextualSentenceVerification",
+        "ContextualSentenceVerificationAndDomainClassification",
+        "ContextualTranslationEditing",
+        "TranslationEditing",
+    ]:
+        word_count = 0
+        for r in result:
+            if r["type"] == "textarea":
+                word_count += calculateWordCount(r)
+        return {
+            "word_count": word_count,
+            "ar_wer_score": ar_wer_score,
+            "as_wer_score": as_wer_score,
+            "rs_wer_score": rs_wer_score,
+            "ar_bleu_score": ar_bleu_score,
+            "rs_bleu_score": rs_bleu_score,
+        }
+    elif project_type in [
+        "ConversationTranslation",
+        "ConversationTranslationEditing",
+        "ConversationVerification",
+    ]:
+        word_count, sentence_count = 0, 0
+        for r in result:
+            if r["type"] == "textarea":
+                word_count += calculateWordCount(r)
+                sentence_count += calculateSentenceCount(
+                    r["value"]["text"][0]
+                )
+
+        return {
+            "word_count": word_count,
+            "sentence_count": sentence_count,
+            "ar_wer_score": ar_wer_score,
+            "as_wer_score": as_wer_score,
+            "rs_wer_score": rs_wer_score,
+            "ar_bleu_score": ar_bleu_score,
+            "rs_bleu_score": rs_bleu_score,
+        }
+    elif project_type in [
+        "OCRTranscription",
+        "OCRTranscriptionEditing",
+        "OCRSegmentCategorizationEditing",
+    ]:
+        word_count = 0
+        for r in result:
+            if r["from_name"] == "ocr_transcribed_json":
+                word_count += calculateWordCount(r)
+        return {
+            "word_count": word_count,
+            "ar_wer_score": ar_wer_score,
+            "as_wer_score": as_wer_score,
+            "rs_wer_score": rs_wer_score,
+            "ar_bleu_score": ar_bleu_score,
+            "rs_bleu_score": rs_bleu_score,
+        }
+
+
+def calculateWordCount(annotation_result):
+    word_count = 0
+    try:
+        word_count = no_of_words(annotation_result["value"]["text"][0])
+    except:
+        pass
+    return word_count
+
+
+def calculateAudioDuration(annotation_result):
+    try:
+        start = annotation_result["value"]["start"]
+        end = annotation_result["value"]["end"]
+    except:
+        start, end = 0, 0
+        pass
+    return abs(end - start)
+
+
+def calculateSentenceCount(text):
+    sentences = re.split(r"[.!?]+", text)
+    return len([sentence for sentence in sentences if sentence.strip()])
diff --git a/backend/tasks/views.py b/backend/tasks/views.py
index 991c4f141..7f549cfbd 100644
--- a/backend/tasks/views.py
+++ b/backend/tasks/views.py
@@ -59,6 +59,7 @@ from utils.date_time_conversions import utc_to_ist
 from django.db import IntegrityError
 
 
+from .utils import compute_meta_stats_for_annotation
 
 
 # Create your views here.
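The next hunk wires compute_meta_stats_for_annotation into partial_update, so meta_stats is only populated when an annotation is saved after this change; annotations created earlier keep a null meta_stats and the report code falls back to recomputing from the raw result. A rough backfill for those pre-existing rows could look like the sketch below. It is not part of this diff: the function name, batch size, and bulk_update flow are illustrative assumptions.

# Illustrative backfill sketch (not part of this diff). Assumes it runs in a
# Django shell or management command with the app registry loaded.
from tasks.models import Annotation
from tasks.utils import compute_meta_stats_for_annotation


def backfill_meta_stats(batch_size=1000):
    # Only touch annotations that have never had meta_stats computed.
    qs = Annotation.objects.filter(meta_stats__isnull=True).select_related(
        "task__project_id"
    )
    batch = []
    for ann in qs.iterator():
        project_type = ann.task.project_id.project_type
        stats = compute_meta_stats_for_annotation(ann, project_type)
        if stats is None:
            # Project types not covered by compute_meta_stats_for_annotation
            # (it returns None for them) are left untouched.
            continue
        ann.meta_stats = stats
        batch.append(ann)
        if len(batch) >= batch_size:
            Annotation.objects.bulk_update(batch, ["meta_stats"])
            batch = []
    if batch:
        Annotation.objects.bulk_update(batch, ["meta_stats"])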
@@ -2335,6 +2336,10 @@ def partial_update(self, request, pk=None): if supercheck_status in [UNVALIDATED, REJECTED, DRAFT, SKIPPED]: task.correct_annotation = None task.save() + annotation_obj.meta_stats = compute_meta_stats_for_annotation( + annotation_obj, annotation_obj.task.project_id.project_type + ) + annotation_obj.save() annotation_response.data["message"] = response_message return annotation_response diff --git a/backend/workspaces/tasks.py b/backend/workspaces/tasks.py index 4720d6a6d..440d8392e 100644 --- a/backend/workspaces/tasks.py +++ b/backend/workspaces/tasks.py @@ -32,6 +32,11 @@ ocr_word_count, ) from tasks.views import SentenceOperationViewSet +from tasks.utils import ( + calculateWordCount, + calculateAudioDuration, + calculateSentenceCount, +) def get_all_annotation_reports( @@ -116,27 +121,41 @@ def get_all_annotation_reports( if a.annotation_type == REVIEWER_ANNOTATION: number_of_tasks_that_has_review_annotations += 1 if ann_ann and rev_ann and not ar_done: - try: - ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( - rev_ann.result, ann_ann.result, project_type - ) + meta_stats = rev_ann.meta_stats + if "word_error_rate" in meta_stats: + ar_wer_score += meta_stats["word_error_rate"] number_of_tasks_contributed_for_ar_wer += 1 ar_done = True - except Exception as e: - pass - try: - s1 = SentenceOperationViewSet() - sampleRequest = { - "annotation_result1": rev_ann.result, - "annotation_result2": ann_ann.result, - } - ar_bleu_score += float( - s1.calculate_bleu_score(sampleRequest).data["bleu_score"] - ) - number_of_tasks_contributed_for_ar_bleu += 1 - except Exception as e: - pass + else: + try: + ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + rev_ann.result, ann_ann.result, project_type + ) + number_of_tasks_contributed_for_ar_wer += 1 + ar_done = True + except Exception as e: + pass + if "bleu_score" in meta_stats: + ar_bleu_score += meta_stats["bleu_score"] + else: + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": rev_ann.result, + "annotation_result2": ann_ann.result, + } + ar_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["bleu_score"] + ) + number_of_tasks_contributed_for_ar_bleu += 1 + except Exception as e: + pass if ann_ann and sup_ann and not as_done: + meta_stats = sup_ann.meta_stats + if "word_error_rate" in meta_stats: + as_wer_score += meta_stats["word_error_rate"] + number_of_tasks_contributed_for_as_wer += 1 + as_done = True try: as_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( sup_ann.result, ann_ann.result, project_type @@ -155,38 +174,160 @@ def get_all_annotation_reports( if project_type in ["ConversationTranslationEditing", "ConversationTranslation"] else False ) - total_audio_duration_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] total_raw_audio_duration_list = [] total_word_count_list = [] only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - total_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + try: + total_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno 
in submitted_tasks: - total_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + total_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: + meta_stats = anno.meta_stats + if not meta_stats: + meta_stats = [] + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + else: + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + 
transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) try: - total_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) total_raw_audio_duration_list.append(anno.task.data["audio_duration"]) except: pass + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) + else: + try: + total_segment_duration.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass else: only_tasks = True - - total_word_count = sum(total_word_count_list) - total_audio_duration = convert_seconds_to_hours(sum(total_audio_duration_list)) - total_raw_audio_duration = convert_seconds_to_hours( - sum(total_raw_audio_duration_list) - ) + total_raw_audio_duration, total_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + total_raw_audio_duration = convert_seconds_to_hours( + sum(total_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) + else: + total_word_count = sum(total_word_count_list) cumulative_rejection_score_ar = 0 if tasks_and_rejection_count_map_ar: for task, rc in tasks_and_rejection_count_map_ar.items(): @@ -198,8 +339,14 @@ def get_all_annotation_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Annotator", - "Total Segments Duration": total_audio_duration, + "Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": total_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Word Count": total_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -229,17 +376,44 @@ def get_all_annotation_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed 
Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] return result @@ -333,26 +507,35 @@ def get_all_review_reports( if a.annotation_type == SUPER_CHECKER_ANNOTATION: number_of_tasks_that_has_sup_annotations += 1 if rev_ann and sup_ann and not rs_done: - try: - rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( - sup_ann.result, rev_ann.result, project_type - ) + meta_stats = sup_ann.meta_stats + if "word_error_rate" in meta_stats: + rs_wer_score += meta_stats["word_error_rate"] number_of_tasks_contributed_for_rs_wer += 1 rs_done = True - except Exception as e: - pass - try: - s1 = SentenceOperationViewSet() - sampleRequest = { - "annotation_result1": sup_ann.result, - "annotation_result2": rev_ann.result, - } - rs_bleu_score += float( - s1.calculate_bleu_score(sampleRequest).data["bleu_score"] - ) - number_of_tasks_contributed_for_rs_bleu += 1 - except Exception as e: - pass + else: + try: + rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation( + sup_ann.result, rev_ann.result, project_type + ) + number_of_tasks_contributed_for_rs_wer += 1 + rs_done = True + except Exception as e: + pass + if "bleu_score" in meta_stats: + rs_bleu_score += meta_stats["bleu_score"] + else: + try: + s1 = SentenceOperationViewSet() + sampleRequest = { + "annotation_result1": sup_ann.result, + "annotation_result2": rev_ann.result, + } + rs_bleu_score += float( + s1.calculate_bleu_score(sampleRequest).data["bleu_score"] + ) + number_of_tasks_contributed_for_rs_bleu += 1 + except Exception as e: + pass submitted_tasks_count = submitted_tasks.count() project_type_lower = project_type.lower() @@ -362,37 +545,158 @@ def get_all_review_reports( if project_type in ["ConversationTranslationEditing", "ConversationTranslation"] else False ) - total_audio_duration_list = [] total_raw_audio_duration_list = [] total_word_count_list = [] + acoustic_normalised_duration = [] + verbatim_duration = [] + transcribed_duration = [] + acoustic_normalised_word_count = [] + verbatim_word_count = [] + transcribed_word_count = [] + total_segment_duration = [] only_tasks = False if is_translation_project: for anno in submitted_tasks: - try: - total_word_count_list.append(anno.task.data["word_count"]) - except: - pass + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + try: + total_word_count_list.append(anno.task.data["word_count"]) + except: + pass elif "OCRTranscription" in project_type: for anno in submitted_tasks: - total_word_count_list.append(ocr_word_count(anno.result)) + meta_stats = anno.meta_stats + if "word_count" in meta_stats: + total_word_count_list.append(meta_stats["word_count"]) + else: + 
total_word_count_list.append(ocr_word_count(anno.result)) elif ( project_type in get_audio_project_types() or project_type == "AllAudioProjects" ): for anno in submitted_tasks: + meta_stats = anno.meta_stats + if project_type == "AllAudioProjects": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + elif project_type == "AcousticNormalisedTranscriptionEditing": + if "acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + else: + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + if "total_segment_duration" in meta_stats: + 
+            if "total_segment_duration" in meta_stats:
+                total_segment_duration.append(meta_stats["total_segment_duration"])
+            else:
+                try:
+                    total_segment_duration.append(
+                        get_audio_transcription_duration(anno.result)
+                    )
+                except:
+                    pass
             try:
-                total_audio_duration_list.append(
-                    get_audio_transcription_duration(anno.result)
-                )
                 total_raw_audio_duration_list.append(anno.task.data["audio_duration"])
             except:
                 pass
     else:
         only_tasks = True
 
-    total_word_count = sum(total_word_count_list)
-    total_audio_duration = convert_seconds_to_hours(sum(total_audio_duration_list))
-    total_raw_audio_duration = convert_seconds_to_hours(
-        sum(total_raw_audio_duration_list)
-    )
+    total_raw_audio_duration, total_word_count = 0, 0
+    if project_type in get_audio_project_types() or project_type == "AllAudioProjects":
+        acoustic_normalised_duration = convert_seconds_to_hours(
+            sum(acoustic_normalised_duration)
+        )
+        verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration))
+        transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration))
+        acoustic_normalised_word_count = sum(acoustic_normalised_word_count)
+        verbatim_word_count = sum(verbatim_word_count)
+        transcribed_word_count = sum(transcribed_word_count)
+        total_raw_audio_duration = convert_seconds_to_hours(
+            sum(total_raw_audio_duration_list)
+        )
+        total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration))
+    else:
+        total_word_count = sum(total_word_count_list)
     cumulative_rejection_score_ar = 0
     if tasks_and_rejection_count_map_ar:
         for task, rc in tasks_and_rejection_count_map_ar.items():
@@ -409,8 +713,14 @@ def get_all_review_reports(
         "Participation Type": participation_type,
         "Role": role,
         "Type of Work": "Review",
-        "Total Segments Duration": total_audio_duration,
+        "Acoustic Normalised Duration": acoustic_normalised_duration,
+        "Verbatim Duration": verbatim_duration,
+        "Transcribed Duration": transcribed_duration,
         "Total Raw Audio Duration": total_raw_audio_duration,
+        "Total Segment Duration": total_segment_duration,
+        "Acoustic Normalised Word Count": acoustic_normalised_word_count,
+        "Verbatim Word Count": verbatim_word_count,
+        "Transcribed Word Count": transcribed_word_count,
         "Word Count": total_word_count,
         "Submitted Tasks": submitted_tasks_count,
         "Language": user_lang,
@@ -437,17 +747,44 @@
     if project_type in get_audio_project_types() or project_type == "AllAudioProjects":
         del result["Word Count"]
+        if project_type == "AcousticNormalisedTranscriptionEditing":
+            del result["Transcribed Duration"]
+            del result["Transcribed Word Count"]
+        elif project_type in get_audio_project_types():
+            del result["Acoustic Normalised Duration"]
+            del result["Verbatim Duration"]
+            del result["Total Raw Audio Duration"]
+            del result["Acoustic Normalised Word Count"]
+            del result["Verbatim Word Count"]
     elif only_tasks:
-        del result["Total Segments Duration"]
-        del result["Total Raw Audio Duration"]
         del result["Word Count"]
+        del result["Acoustic Normalised Duration"]
+        del result["Verbatim Duration"]
+        del result["Transcribed Duration"]
+        del result["Total Raw Audio Duration"]
+        del result["Total Segment Duration"]
+        del result["Acoustic Normalised Word Count"]
+        del result["Verbatim Word Count"]
+        del result["Transcribed Word Count"]
     elif is_CT_OR_CTE:
-        del result["Total Segments Duration"]
-        del result["Total Raw Audio Duration"]
         del result["Word Count"]
+        del result["Acoustic Normalised Duration"]
+        del result["Verbatim Duration"]
+        del result["Transcribed Duration"]
+        del result["Total Raw Audio Duration"]
+        del result["Total Segment Duration"]
+        del result["Acoustic Normalised Word Count"]
+        del result["Verbatim Word Count"]
+        del result["Transcribed Word Count"]
     else:
-        del result["Total Segments Duration"]
-        del result["Total Raw Audio Duration"]
+        del result["Acoustic Normalised Duration"]
+        del result["Verbatim Duration"]
+        del result["Transcribed Duration"]
+        del result["Total Raw Audio Duration"]
+        del result["Total Segment Duration"]
+        del result["Acoustic Normalised Word Count"]
+        del result["Verbatim Word Count"]
+        del result["Transcribed Word Count"]
 
     return result
 
 
@@ -507,26 +844,134 @@ def get_all_supercheck_reports(
         else False
     )
     validated_word_count_list = []
-    validated_audio_duration_list = []
+    acoustic_normalised_duration = []
+    verbatim_duration = []
+    transcribed_duration = []
+    acoustic_normalised_word_count = []
+    verbatim_word_count = []
+    transcribed_word_count = []
+    total_segment_duration = []
     validated_raw_audio_duration_list = []
     only_tasks = False
     if is_translation_project:
         for anno in submitted_tasks:
-            try:
-                validated_word_count_list.append(anno.task.data["word_count"])
-            except:
-                pass
+            meta_stats = anno.meta_stats
+            if "word_count" in meta_stats:
+                validated_word_count_list.append(meta_stats["word_count"])
+            else:
+                try:
+                    validated_word_count_list.append(anno.task.data["word_count"])
+                except:
+                    pass
     elif "OCRTranscription" in project_type:
         for anno in submitted_tasks:
-            validated_word_count_list.append(ocr_word_count(anno.result))
+            meta_stats = anno.meta_stats
+            if "word_count" in meta_stats:
+                validated_word_count_list.append(meta_stats["word_count"])
+            else:
+                validated_word_count_list.append(ocr_word_count(anno.result))
     elif (
         project_type in get_audio_project_types()
         or project_type == "AllAudioProjects"
     ):
         for anno in submitted_tasks:
+            meta_stats = anno.meta_stats
+            if project_type == "AllAudioProjects":
+                if "acoustic_normalised_duration" in meta_stats:
+                    acoustic_normalised_duration.append(
+                        meta_stats["acoustic_normalised_duration"]
+                    )
+                else:
+                    for r in anno.result:
+                        if r["from_name"] == "acoustic_normalised_transcribed_json":
+                            acoustic_normalised_duration.append(
+                                calculateAudioDuration(r)
+                            )
+                if "acoustic_normalised_word_count" in meta_stats:
+                    acoustic_normalised_word_count.append(
+                        meta_stats["acoustic_normalised_word_count"]
+                    )
+                else:
+                    for r in anno.result:
+                        if r["from_name"] == "acoustic_normalised_transcribed_json":
+                            acoustic_normalised_word_count.append(calculateWordCount(r))
+                if "verbatim_duration" in meta_stats:
+                    verbatim_duration.append(meta_stats["verbatim_duration"])
+                else:
+                    for r in anno.result:
+                        if r["from_name"] == "verbatim_transcribed_json":
+                            verbatim_duration.append(calculateAudioDuration(r))
+                if "verbatim_word_count" in meta_stats:
+                    verbatim_word_count.append(meta_stats["verbatim_word_count"])
+                else:
+                    for r in anno.result:
+                        if r["from_name"] == "verbatim_transcribed_json":
+                            verbatim_word_count.append(calculateWordCount(r))
+                if "transcribed_duration" in meta_stats:
+                    transcribed_duration.append(meta_stats["transcribed_duration"])
+                else:
+                    for r in anno.result:
+                        if r["from_name"] == "transcribed_json":
+                            transcribed_duration.append(calculateAudioDuration(r))
+                if "transcribed_word_count" in meta_stats:
+                    transcribed_word_count.append(meta_stats["transcribed_word_count"])
+                else:
+                    for r in anno.result:
+                        if r["from_name"] == "transcribed_json":
+                            transcribed_word_count.append(calculateWordCount(r))
+            elif project_type == "AcousticNormalisedTranscriptionEditing":
"acoustic_normalised_duration" in meta_stats: + acoustic_normalised_duration.append( + meta_stats["acoustic_normalised_duration"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_duration.append( + calculateAudioDuration(r) + ) + if "acoustic_normalised_word_count" in meta_stats: + acoustic_normalised_word_count.append( + meta_stats["acoustic_normalised_word_count"] + ) + else: + for r in anno.result: + if r["from_name"] == "acoustic_normalised_transcribed_json": + acoustic_normalised_word_count.append(calculateWordCount(r)) + if "verbatim_duration" in meta_stats: + verbatim_duration.append(meta_stats["verbatim_duration"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_duration.append(calculateAudioDuration(r)) + if "verbatim_word_count" in meta_stats: + verbatim_word_count.append(meta_stats["verbatim_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "verbatim_transcribed_json": + verbatim_word_count.append(calculateWordCount(r)) + else: + if "transcribed_duration" in meta_stats: + transcribed_duration.append(meta_stats["transcribed_duration"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_duration.append(calculateAudioDuration(r)) + if "transcribed_word_count" in meta_stats: + transcribed_word_count.append(meta_stats["transcribed_word_count"]) + else: + for r in anno.result: + if r["from_name"] == "transcribed_json": + transcribed_word_count.append(calculateAudioDuration(r)) + if "total_segment_duration" in meta_stats: + total_segment_duration.append(meta_stats["total_segment_duration"]) + else: + try: + total_segment_duration.append( + get_audio_transcription_duration(anno.result) + ) + except: + pass try: - validated_audio_duration_list.append( - get_audio_transcription_duration(anno.result) - ) validated_raw_audio_duration_list.append( anno.task.data["audio_duration"] ) @@ -535,13 +980,24 @@ def get_all_supercheck_reports( else: only_tasks = True - validated_word_count = sum(validated_word_count_list) - validated_audio_duration = convert_seconds_to_hours( - sum(validated_audio_duration_list) - ) - validated_raw_audio_duration = convert_seconds_to_hours( - sum(validated_raw_audio_duration_list) - ) + validated_raw_audio_duration, validated_word_count = 0, 0 + if project_type in get_audio_project_types() or project_type == "AllAudioProjects": + acoustic_normalised_duration = convert_seconds_to_hours( + sum(acoustic_normalised_duration) + ) + verbatim_duration = convert_seconds_to_hours(sum(verbatim_duration)) + transcribed_duration = convert_seconds_to_hours(sum(transcribed_duration)) + acoustic_normalised_word_count = convert_seconds_to_hours( + sum(acoustic_normalised_word_count) + ) + verbatim_word_count = convert_seconds_to_hours(sum(verbatim_word_count)) + transcribed_word_count = convert_seconds_to_hours(sum(transcribed_word_count)) + validated_raw_audio_duration = convert_seconds_to_hours( + sum(validated_raw_audio_duration_list) + ) + total_segment_duration = convert_seconds_to_hours(sum(total_segment_duration)) + else: + validated_word_count = sum(validated_word_count_list) cumulative_rejection_score_rs = 0 if tasks_and_rejection_count_map_rs: for task, rc in tasks_and_rejection_count_map_rs.items(): @@ -553,8 +1009,14 @@ def get_all_supercheck_reports( "Participation Type": participation_type, "Role": role, "Type of Work": "Supercheck", - "Total Segments Duration": validated_audio_duration, + 
"Acoustic Normalised Duration": acoustic_normalised_duration, + "Verbatim Duration": verbatim_duration, + "Transcribed Duration": transcribed_duration, "Total Raw Audio Duration": validated_raw_audio_duration, + "Total Segment Duration": total_segment_duration, + "Acoustic Normalised Word Count": acoustic_normalised_word_count, + "Verbatim Word Count": verbatim_word_count, + "Transcribed Word Count": transcribed_word_count, "Word Count": validated_word_count, "Submitted Tasks": submitted_tasks_count, "Language": user_lang, @@ -566,19 +1028,44 @@ def get_all_supercheck_reports( if project_type in get_audio_project_types() or project_type == "AllAudioProjects": del result["Word Count"] + if project_type == "AcousticNormalisedTranscriptionEditing": + del result["Transcribed Duration"] + del result["Transcribed Word Count"] + elif project_type in get_audio_project_types(): + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Raw Audio Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] elif only_tasks: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] elif is_CT_OR_CTE: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] del result["Word Count"] + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] else: - del result["Total Segments Duration"] - del result["Total Raw Audio Duration"] - - return result + del result["Acoustic Normalised Duration"] + del result["Verbatim Duration"] + del result["Transcribed Duration"] + del result["Raw Audio Duration"] + del result["Total Segment Duration"] + del result["Acoustic Normalised Word Count"] + del result["Verbatim Word Count"] + del result["Transcribed Word Count"] @shared_task(queue="reports")