calculating reliability of vtc against all human annotations together #377
-
I'm working on the Solomon dataset, where there have been 2 annotation campaigns, with 4 and 3 human annotations respectively. I'd like to get the most informed estimate of precision and recall, or more simply the F-score, for each of the 4 speaker types of VTC. To do so, I adapted a script along the following lines:
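Roughly, the adapted script has this shape (a minimal sketch, not the exact code: the path and set names are placeholders, and the exact method names/signatures may differ a bit across ChildProject versions):

```python
from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

# placeholder path to the dataset
project = ChildProject('path/to/solomon')

am = AnnotationManager(project)
am.read()  # load the annotation index

annotations = am.annotations  # one row per imported annotation file

# keep the automated set plus the human sets (placeholder names)
annotations = annotations[
    (annotations['set'] == 'vtc') | annotations['set'].str.startswith('human')
]

# restrict the comparison to the portion covered by every selected set
intersection = am.intersect(annotations)
segments = am.get_segments(intersection)

# keep only the four key speaker types
segments = segments[segments['speaker_type'].isin(['CHI', 'OCH', 'FEM', 'MAL'])]
```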
I get an error in line 48:
My best guess right now is that something goes wrong at that line, so I started a Python session to check step by step. `annotations` ended up having 1241 rows and 12 columns, and it looks perfectly reasonable:
The following line, which raised an error when running the script as a whole, caused no problem here, and `segments` has some content:
After subsetting to the key speakers, 4576557 rows remain. Outside of Python, I unlocked the stats file, which appeared to have been created previously, so that I could update it. So the problem actually arises when creating the intersection across annotations:
That's weird! There should be some overlap between VTC and the human annotations! And indeed there is:
```
392  vtc  2_CW2_CH2_AJ01_AJ09_190609.WAV  0  0  ...  2021-03-06 19:08:24  NaN  0.0.1  65900488.0
```
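That row comes from a plain pandas check on the annotation index, something like the following (the column names are what I believe the index uses):

```python
# look for VTC annotations covering this recording in the index
vtc_rows = annotations[
    (annotations['set'] == 'vtc')
    & (annotations['recording_filename'] == '2_CW2_CH2_AJ01_AJ09_190609.WAV')
]
print(vtc_rows)
```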
-
Not yet an answer, but progress: if, instead of `startswith` + options, you use `isin`, the intersection is not empty:
`stats` contains:
Not sure how to understand these numbers, though. `intersection` contains:
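To make the change concrete, the difference is roughly this (a sketch; the human set names here are placeholders, not the real ones):

```python
# original selection: prefix match on the set name (plus further options)
with_startswith = annotations[
    (annotations['set'] == 'vtc') | annotations['set'].str.startswith('human')
]

# replacement: an explicit list of set names
with_isin = annotations[
    annotations['set'].isin(['vtc', 'human_campaign1', 'human_campaign2'])
]

# with the second selection, am.intersect(with_isin) is no longer empty
```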
-
I also checked, and the dataset passes validation:
-
The `am.read()` step does spit out a lot of garbled output:
-
Are you trying to get a global F-score, or one F-score per annotator? If you collect all annotations from the VTC + all annotators at once and apply `am.intersect` on them, you will only get the portion that is covered by ALL of them (so every annotator + the VTC), which might be none. The original code computes the intersection for each annotator separately, derives the confusion matrix, and moves on to the next annotator. Hope that clears things up a bit?
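Roughly, the per-annotator version looks like this (a sketch with placeholder set names; `compute_confusion` is just a stand-in for however you derive the confusion matrix and F-scores):

```python
results = {}

# placeholder names for the human annotation sets
for human_set in ['human1', 'human2', 'human3']:
    pair = annotations[annotations['set'].isin(['vtc', human_set])]

    # portion covered by BOTH the vtc and this annotator (only these two sets)
    intersection = am.intersect(pair)
    segments = am.get_segments(intersection)
    segments = segments[segments['speaker_type'].isin(['CHI', 'OCH', 'FEM', 'MAL'])]

    # derive the confusion matrix / F-scores for this pair, then move on
    results[human_set] = compute_confusion(segments)  # stand-in, not a real ChildProject function
```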
-
Thanks, I get it now!
I want a global score, collapsing across all human annotators. I wonder how I can hack the system to do that…
-
What I ended up doing is to get the VTC-human agreement for every human, and the human-human agreement. Then, in the paper, I reported the weighted average F-score of VTC-human (i.e., giving more weight to coders who had coded more data, NOT weighting by how much their annotations overlap), as well as the weighted average F-score of human-human. This lets us answer the question of whether VTC is more or less accurate, relative to the human coders, than those same coders are relative to one another.
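For concreteness, the weighting amounts to this (a sketch with made-up numbers; the weight is how much data each coder annotated, not how much overlaps with anyone else):

```python
import pandas as pd

# hypothetical per-coder results: F-score of VTC against that coder,
# plus the amount of data that coder annotated (the weight)
per_coder = pd.DataFrame({
    'coder':           ['A', 'B', 'C'],
    'vtc_human_f':     [0.62, 0.58, 0.70],
    'hours_annotated': [10.0, 4.0, 6.0],
})

weighted_f = (
    (per_coder['vtc_human_f'] * per_coder['hours_annotated']).sum()
    / per_coder['hours_annotated'].sum()
)
print(weighted_f)  # weighted average VTC-human F-score; same recipe for human-human
```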