initial changes in partial_update #1107

Open · wants to merge 9 commits into base: dev · Changes from all commits
27 changes: 18 additions & 9 deletions backend/functions/tasks.py
@@ -1541,15 +1541,24 @@ def update_meta_stats(
                ann_obj.result
            )
        elif project_type in get_audio_project_types():
-            result_meta_stats[ann_obj.annotation_status]["Raw Audio Duration"] += task_data[
-                "audio_duration"
-            ]
-            result_meta_stats[ann_obj.annotation_status][
-                "Segment Duration"
-            ] += get_audio_transcription_duration(ann_obj.result)
-            result_meta_stats[ann_obj.annotation_status][
-                "Not Null Segment Duration"
-            ] += get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id)
+            meta_stats = ann_obj.meta_stats
+            if meta_stats:
+                result_meta_stats[ann_obj.annotation_status][
+                    "Segment Duration"
+                ] = meta_stats["total_segment_duration"]
+                result_meta_stats[ann_obj.annotation_status][
+                    "Not Null Segment Duration"
+                ] = meta_stats["not_null_segment_duration"]
+            else:
+                result_meta_stats[ann_obj.annotation_status][
+                    "Raw Audio Duration"
+                ] += task_data["audio_duration"]
+                result_meta_stats[ann_obj.annotation_status][
+                    "Segment Duration"
+                ] += get_audio_transcription_duration(ann_obj.result)
+                result_meta_stats[ann_obj.annotation_status][
+                    "Not Null Segment Duration"
+                ] += get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id)


def calculate_ced_between_two_annotations(annotation1, annotation2):
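Reviewer note: the new fast path reads exactly two keys from the cached dict and, notably, skips the "Raw Audio Duration" increment that the recompute path still performs. A minimal sketch of the payload it expects (key names come from this diff; the numbers are illustrative):

    cached_meta_stats = {
        "total_segment_duration": 42.7,  # seconds, as computed by get_audio_transcription_duration
        "not_null_segment_duration": 39.1,  # seconds of segments with non-empty text
    }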
695 changes: 592 additions & 103 deletions backend/organizations/tasks.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/projects/utils.py
@@ -173,7 +173,7 @@ def get_audio_transcription_duration(annotation_result):
    return audio_duration


-def get_not_null_audio_transcription_duration(annotation_result, ann_id):
+def get_not_null_audio_transcription_duration(annotation_result, ann_id=None):
    audio_duration = 0
    memory = create_memory(annotation_result)
    for key, indexes in memory.items():
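Reviewer note: ann_id becomes optional because the new helpers in backend/tasks/utils.py call this function without it. Both call forms are now valid:

    get_not_null_audio_transcription_duration(ann_obj.result, ann_obj.id)  # existing call sites
    get_not_null_audio_transcription_duration(result)  # new call sites in tasks/utils.py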
22 changes: 22 additions & 0 deletions backend/tasks/migrations/0049_annotation_meta_stats.py
@@ -0,0 +1,22 @@
# Generated by Django 3.2.14 on 2024-09-19 03:43

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("tasks", "0048_alter_annotation_unique_together"),
    ]

    operations = [
        migrations.AddField(
            model_name="annotation",
            name="meta_stats",
            field=models.JSONField(
                blank=True,
                help_text="Meta statistics for the annotation result",
                null=True,
                verbose_name="meta_stats",
            ),
        ),
    ]
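Reviewer note: existing Annotation rows will have meta_stats = NULL after this migration, so update_meta_stats keeps taking the recompute path for them. A hypothetical one-off backfill (not part of this PR; backfill_meta_stats is an invented name) could warm the cache:

    from tasks.models import Annotation
    from tasks.utils import compute_meta_stats_for_annotation

    def backfill_meta_stats(batch_size=500):
        # Populate meta_stats for rows created before this migration so the
        # cached path in update_meta_stats can be taken for them as well.
        qs = Annotation.objects.filter(meta_stats__isnull=True).select_related("task")
        for ann in qs.iterator(chunk_size=batch_size):
            project_type = ann.task.project_id.project_type
            ann.meta_stats = compute_meta_stats_for_annotation(ann, project_type)
            ann.save(update_fields=["meta_stats"])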
7 changes: 7 additions & 0 deletions backend/tasks/models.py
@@ -261,6 +261,13 @@ class Annotation(models.Model):
        help_text=("Time when the annotation was first labeled/accepted/validated"),
    )

+    meta_stats = models.JSONField(
+        blank=True,
+        null=True,
+        verbose_name="meta_stats",
+        help_text="Meta statistics for the annotation result",
+    )
+
    def __str__(self):
        return str(self.id)

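Reviewer note: since the field is nullable, readers should tolerate None. A defensive read looks like this (sketch, not in the PR):

    stats = annotation.meta_stats or {}
    segment_duration = stats.get("total_segment_duration", 0)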
231 changes: 231 additions & 0 deletions backend/tasks/utils.py
@@ -1,7 +1,27 @@
import os
import re
from requests import RequestException
import requests
from dotenv import load_dotenv
from projects.utils import (
    no_of_words,
    get_audio_project_types,
    get_audio_transcription_duration,
    get_not_null_audio_transcription_duration,
    calculate_word_error_rate_between_two_audio_transcription_annotation,
)
from tasks.models import (
    Annotation,
    REVIEWER_ANNOTATION,
    ANNOTATOR_ANNOTATION,
    SUPER_CHECKER_ANNOTATION,
    ACCEPTED,
    ACCEPTED_WITH_MINOR_CHANGES,
    ACCEPTED_WITH_MAJOR_CHANGES,
    VALIDATED,
    VALIDATED_WITH_CHANGES,
)


Queued_Task_name = {
    "dataset.tasks.deduplicate_dataset_instance_items": "Deduplicate Dataset Instance Items",
@@ -60,3 +80,214 @@ def query_flower(filters=None):
        return {"error": "Failed to retrieve tasks from Flower"}
    except RequestException as e:
        return {"error": f" failed to connect to flower API, {str(e)}"}


def compute_meta_stats_for_annotation(ann_obj, project_type):
    from tasks.views import SentenceOperationViewSet

    task_obj = ann_obj.task
    task_data = task_obj.data
    ced_project_type_choices = ["ContextualTranslationEditing"]
    result_meta_stats = {}
    result = ann_obj.result

    # calculating WER and BLEU scores between the annotator (a), reviewer (r)
    # and super-checker (s) annotations of the same task
    all_annotations = Annotation.objects.filter(task_id=task_obj.id)
    ar_wer_score, as_wer_score, rs_wer_score = 0, 0, 0
    ar_bleu_score, rs_bleu_score = 0, 0
    ar_done, as_done, rs_done = False, False, False
    ann_ann, rev_ann, sup_ann = "", "", ""
    for a in all_annotations:
        if a.annotation_type == REVIEWER_ANNOTATION and a.annotation_status in [
            ACCEPTED,
            ACCEPTED_WITH_MINOR_CHANGES,
            ACCEPTED_WITH_MAJOR_CHANGES,
        ]:
            rev_ann = a
        elif a.annotation_type == SUPER_CHECKER_ANNOTATION and a.annotation_status in [
            VALIDATED,
            VALIDATED_WITH_CHANGES,
        ]:
            sup_ann = a
        elif a.annotation_type == ANNOTATOR_ANNOTATION:
            ann_ann = a
        if ann_ann and rev_ann and not ar_done:
            try:
                ar_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation(
                    rev_ann.result, ann_ann.result, project_type
                )
                ar_done = True
            except Exception:
                pass
            try:
                s1 = SentenceOperationViewSet()
                sampleRequest = {
                    "annotation_result1": rev_ann.result,
                    "annotation_result2": ann_ann.result,
                }
                ar_bleu_score += float(
                    s1.calculate_bleu_score(sampleRequest).data["ar_bleu_score"]
                )
            except Exception:
                pass
        if rev_ann and sup_ann and not rs_done:
            try:
                rs_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation(
                    sup_ann.result, rev_ann.result, project_type
                )
                rs_done = True
            except Exception:
                pass
            try:
                s1 = SentenceOperationViewSet()
                sampleRequest = {
                    "annotation_result1": sup_ann.result,
                    "annotation_result2": rev_ann.result,
                }
                rs_bleu_score += float(
                    s1.calculate_bleu_score(sampleRequest).data["rs_bleu_score"]
                )
            except Exception:
                pass
        if ann_ann and sup_ann and not as_done:
            # reuse the WER cached in the super checker's meta_stats when
            # available; only recompute when it is missing
            meta_stats = sup_ann.meta_stats
            if meta_stats and "as_wer_score" in meta_stats:
                as_wer_score += meta_stats["as_wer_score"]
                as_done = True
            else:
                try:
                    as_wer_score += calculate_word_error_rate_between_two_audio_transcription_annotation(
                        sup_ann.result, ann_ann.result, project_type
                    )
                    as_done = True
                except Exception:
                    pass

    if project_type == "AcousticNormalisedTranscriptionEditing":
        (
            acoustic_normalised_word_count,
            verbatim_word_count,
            acoustic_normalised_duration,
            verbatim_duration,
        ) = (0, 0, 0, 0)
        for r in result:
            if r["from_name"] == "acoustic_normalised_transcribed_json":
                acoustic_normalised_word_count += calculateWordCount(r)
                acoustic_normalised_duration += calculateAudioDuration(r)
            elif r["from_name"] == "verbatim_transcribed_json":
                verbatim_word_count += calculateWordCount(r)
                verbatim_duration += calculateAudioDuration(r)
        segment_duration = get_audio_transcription_duration(result)
        not_null_segment_duration = get_not_null_audio_transcription_duration(result)
        return {
            "acoustic_normalised_word_count": acoustic_normalised_word_count,
            "verbatim_word_count": verbatim_word_count,
            "acoustic_normalised_duration": acoustic_normalised_duration,
            "verbatim_duration": verbatim_duration,
            "total_segment_duration": segment_duration,
            "not_null_segment_duration": not_null_segment_duration,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }
    elif project_type in ["AudioTranscription", "AudioTranscriptionEditing"]:
        transcribed_word_count, transcribed_duration = 0, 0
        for r in result:
            if r["from_name"] == "transcribed_json":
                transcribed_word_count += calculateWordCount(r)
                transcribed_duration += calculateAudioDuration(r)
        segment_duration = get_audio_transcription_duration(result)
        not_null_segment_duration = get_not_null_audio_transcription_duration(result)
        return {
            "audio_word_count": transcribed_word_count,
            "transcribed_duration": transcribed_duration,
            "total_segment_duration": segment_duration,
            "not_null_segment_duration": not_null_segment_duration,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }
    elif project_type in [
        "ContextualSentenceVerification",
        "ContextualSentenceVerificationAndDomainClassification",
        "ContextualTranslationEditing",
        "TranslationEditing",
    ]:
        word_count = 0
        for r in result:
            if r["type"] == "textarea":
                word_count += calculateWordCount(r)
        return {
            "word_count": word_count,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }
    elif project_type in [
        "ConversationTranslation",
        "ConversationTranslationEditing",
        "ConversationVerification",
    ]:
        word_count, sentence_count = 0, 0
        for r in result:
            if r["type"] == "textarea":
                word_count += calculateWordCount(r)
                sentence_count += calculateSentenceCount(r["value"]["text"][0])

        return {
            "word_count": word_count,
            "sentence_count": sentence_count,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }
    elif project_type in [
        "OCRTranscription",
        "OCRTranscriptionEditing",
        "OCRSegmentCategorizationEditing",
    ]:
        word_count = 0
        for r in result:
            if r["from_name"] == "ocr_transcribed_json":
                word_count += calculateWordCount(r)
        return {
            "word_count": word_count,
            "ar_wer_score": ar_wer_score,
            "as_wer_score": as_wer_score,
            "rs_wer_score": rs_wer_score,
            "ar_bleu_score": ar_bleu_score,
            "rs_bleu_score": rs_bleu_score,
        }


def calculateWordCount(annotation_result):
    word_count = 0
    try:
        word_count = no_of_words(annotation_result["value"]["text"][0])
    except Exception:
        pass
    return word_count


def calculateAudioDuration(annotation_result):
    try:
        start = annotation_result["value"]["start"]
        end = annotation_result["value"]["end"]
    except Exception:
        start, end = 0, 0
    return abs(end - start)


def calculateSentenceCount(text):
    sentences = re.split(r"[.!?]+", text)
    return len([sentence for sentence in sentences if sentence.strip()])
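Reviewer note: for orientation, this is roughly what the helper returns for a plain AudioTranscription annotation; a sketch with illustrative numbers:

    stats = compute_meta_stats_for_annotation(ann_obj, "AudioTranscription")
    # {
    #     "audio_word_count": 57,
    #     "transcribed_duration": 41.3,
    #     "total_segment_duration": 42.7,
    #     "not_null_segment_duration": 39.1,
    #     "ar_wer_score": 0.12,
    #     "as_wer_score": 0,
    #     "rs_wer_score": 0,
    #     "ar_bleu_score": 0.81,
    #     "rs_bleu_score": 0,
    # }
    # Project types that match no branch fall through and return None.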
5 changes: 5 additions & 0 deletions backend/tasks/views.py
@@ -59,6 +59,7 @@

from utils.date_time_conversions import utc_to_ist
from django.db import IntegrityError
+from .utils import compute_meta_stats_for_annotation

# Create your views here.

@@ -2335,6 +2336,10 @@ def partial_update(self, request, pk=None):
                if supercheck_status in [UNVALIDATED, REJECTED, DRAFT, SKIPPED]:
                    task.correct_annotation = None
                    task.save()
+        annotation_obj.meta_stats = compute_meta_stats_for_annotation(
+            annotation_obj, annotation_obj.task.project_id.project_type
+        )
+        annotation_obj.save()
        annotation_response.data["message"] = response_message
        return annotation_response

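Reviewer note: compute_meta_stats_for_annotation returns None for project types it does not handle, so this hook overwrites any previously stored stats with None for those projects. A guard would preserve them; a sketch of that alternative, not what the PR does:

    new_stats = compute_meta_stats_for_annotation(
        annotation_obj, annotation_obj.task.project_id.project_type
    )
    if new_stats is not None:
        annotation_obj.meta_stats = new_stats
        annotation_obj.save(update_fields=["meta_stats"])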