diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ff175d69..5320ee346 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
 
 ### Added
 
+- conversations summary extraction pipeline
 - docs and tests for init command
 - docs and tests for automated-import command
 
diff --git a/ChildProject/annotations.py b/ChildProject/annotations.py
index 23a0f7245..89dcc9b71 100644
--- a/ChildProject/annotations.py
+++ b/ChildProject/annotations.py
@@ -11,7 +11,7 @@ import logging
 
 from . import __version__
-from .pipelines.derivations import DERIVATIONS
+from .pipelines.derivations import DERIVATIONS, conversations
 from .projects import ChildProject
 from .converters import *
 from .tables import IndexTable, IndexColumn, assert_dataframe, assert_columns_presence
 
@@ -1967,8 +1967,8 @@ def clip_segments(segments: pd.DataFrame, start: int, stop: int) -> pd.DataFrame
     start = int(start)
     stop = int(stop)
 
-    segments["segment_onset"].clip(lower=start, upper=stop, inplace=True)
-    segments["segment_offset"].clip(lower=start, upper=stop, inplace=True)
+    segments["segment_onset"] = segments["segment_onset"].clip(lower=start, upper=stop)
+    segments["segment_offset"] = segments["segment_offset"].clip(lower=start, upper=stop)
 
     segments = segments[segments["segment_offset"] > segments["segment_onset"]]
 
diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py
index 933789574..75d24f463 100755
--- a/ChildProject/cmdline.py
+++ b/ChildProject/cmdline.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 from .projects import ChildProject, RAW_RECORDINGS, METADATA_FOLDER, RECORDINGS_CSV, CHILDREN_CSV
 from .annotations import AnnotationManager
+from .pipelines.conversations import ConversationsPipeline
+from .pipelines.conversations import ConversationsSpecificationPipeline
 from .converters import extensions
 from .pipelines.samplers import SamplerPipeline
 from .pipelines.eafbuilder import EafBuilderPipeline
@@ -744,6 +746,8 @@ def main():
     register_pipeline("anonymize", AnonymizationPipeline)
     register_pipeline("metrics", MetricsPipeline)
     register_pipeline("metrics-specification", MetricsSpecificationPipeline)
+    register_pipeline("conversations-summary", ConversationsPipeline)
+    register_pipeline("conversations-specification", ConversationsSpecificationPipeline)
 
     args = parser.parse_args()
     args.func(args)
diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py
new file mode 100644
index 000000000..6328983b6
--- /dev/null
+++ b/ChildProject/pipelines/conversationFunctions.py
@@ -0,0 +1,167 @@
+import pandas as pd
+import numpy as np
+import ast
+import re
+import functools
+from typing import Union, Set, Tuple
+
+"""
+This file lists all the metrics functions commonly used.
+New metrics can be added by defining new functions for the Conversations class to use :
+ - Create a new function using the same arguments (i.e. segments, **kwargs)
+ - Define calculation of the metric with:
+  - segments, which is a dataframe containing all the relevant annotated segments to use. It contains the
+  annotation content (https://childproject.readthedocs.io/en/latest/format.html#id10) joined with the annotation
+  index info (https://childproject.readthedocs.io/en/latest/format.html#id11) as well as any column that was
+  requested to be added to the results by the user using --child-cols or --rec-cols (eg --child-cols child_dob,
+  languages will make columns 'child_dob' and 'languages' available)
+  - kwargs, whatever keyword parameter you chose to pass to the function (except 'name' and 'callable' which can
+  not be used). This will need to be given with the list of metrics when called
+ - Wrap your function with the 'conversationFunction' decorator to make it callable by the pipeline, read conversationFunction help
+ for more info
+
+!! Metrics functions should still behave and return the correct result when receiving an empty dataframe
+"""
+RESERVED = {'name', 'callable'}  # arguments reserved usage. use other keyword labels.
+
+
+def conversationFunction(args: set = None):
+    """Decorator for all metrics functions to make them ready to be called by the pipeline.
+
+    :param args: set of required keyword arguments for that function, raise ValueError if were not given \
+    you cannot use keywords [name, callable] as they are reserved. None (default) means no required argument
+    :type args: set
+    :return: new function to substitute the metric function
+    :rtype: Callable
+    """
+    # freeze the required names once: no shared mutable default, no effect of later mutation by the caller
+    required = frozenset(args) if args is not None else frozenset()
+
+    def decorator(function):
+        for a in required:
+            if a in RESERVED:
+                raise ValueError(
+                    'Error when defining {} with required argument {}, you cannot use reserved keywords {}, '
+                    'change your required argument name'.format(
+                        function.__name__, a, RESERVED))
+
+        @functools.wraps(function)
+        def new_func(segments: pd.DataFrame, **kwargs):
+            for arg in required:
+                if arg not in kwargs:
+                    raise ValueError(f"{function.__name__} metric needs an argument <{arg}>")
+
+            return function(segments, **kwargs)
+
+        return new_func
+
+    return decorator
+
+
+@conversationFunction()
+def who_initiated(segments: pd.DataFrame):
+    """speaker type who spoke first in the conversation, read from the first row (NaN when there is no segment)
+
+    Required keyword arguments:
+    """
+    if segments.empty:
+        return np.nan
+    return segments.iloc[0]['speaker_type']
+
+
+@conversationFunction()
+def who_finished(segments: pd.DataFrame):
+    """speaker type who spoke last in the conversation (NaN when there is no segment)
+
+    Required keyword arguments:
+    """
+    if segments.empty:
+        return np.nan
+    return segments[segments['segment_offset'] == segments['segment_offset'].max()].iloc[0]['speaker_type']
+
+
+@conversationFunction()
+def participants(segments: pd.DataFrame):
+    """list of speakers participating in the conversation, '/' separated
+
+    Required keyword arguments:
+    """
+    # missing speaker types are skipped, str.join would fail on NaN
+    return '/'.join(segments['speaker_type'].dropna().unique())
+
+
+@conversationFunction()
+def voc_total_dur(segments: pd.DataFrame):
+    """summed duration of all speech in the conversation (ms) N.B. can be higher than conversation duration as
+    speakers may speak at the same time, resulting in multiple spoken segments happening simultaneously
+
+    Required keyword arguments:
+    """
+    return segments['voc_duration'].sum()
+
+
+@conversationFunction({'speaker'})
+def is_speaker(segments: pd.DataFrame, **kwargs):
+    """is a specific speaker type present in the conversation
+
+    Required keyword arguments:
+        - speaker : speaker_type label
+    """
+    return kwargs["speaker"] in segments['speaker_type'].tolist()
+
+
+@conversationFunction({'speaker'})
+def voc_speaker_count(segments: pd.DataFrame, **kwargs):
+    """number of vocalizations produced by a given speaker
+
+    Required keyword arguments:
+        - speaker : speaker_type label
+    """
+    return segments[segments['speaker_type'] == kwargs["speaker"]]['speaker_type'].count()
+
+
+@conversationFunction({'speaker'})
+def voc_speaker_dur(segments: pd.DataFrame, **kwargs):
+    """summed duration of speech for a given speaker in the conversation
+
+    Required keyword arguments:
+        - speaker : speaker_type label
+    """
+    return segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1)
+
+
+@conversationFunction({'speaker'})
+def voc_dur_contribution(segments: pd.DataFrame, **kwargs):
+    """contribution of a given speaker in the conversation compared to others, in terms of total speech duration
+    (NaN when the conversation has no speech duration at all)
+
+    Required keyword arguments:
+        - speaker : speaker_type label
+    """
+    total = segments['voc_duration'].sum()
+    if not total:
+        return np.nan
+    speaker_total = segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1)
+    return speaker_total / total
+
+
+@conversationFunction()
+def assign_conv_type(segments: pd.DataFrame):
+    """Compute the conversation type (overheard, dyadic_XXX, peer, parent, triadic_XXX, multiparty) depending on the
+    participants
+
+    Required keyword arguments:
+    """
+    present = set(segments['speaker_type'].dropna())
+    if 'CHI' not in present:
+        return 'overheard'
+
+    with_peer = 'OCH' in present
+    adults = [adult for adult in ('FEM', 'MAL') if adult in present]
+
+    if len(adults) == 1:
+        return ('triadic_' if with_peer else 'dyadic_') + adults[0]
+    if len(adults) == 2:
+        return 'multiparty' if with_peer else 'parent'
+    # no adult: the key child talks with other children, or alone (no defined type)
+    return 'peer' if with_peer else np.nan
diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py
new file mode 100644
index 000000000..4f1f25b1e
--- /dev/null
+++ b/ChildProject/pipelines/conversations.py
@@ -0,0 +1,681 @@
+from abc import ABC, abstractmethod
+import os
+import itertools
+import argparse
+import datetime
+import multiprocessing as mp
+import logging
+import functools
+
+import numpy as np
+import pandas as pd
+from typing import Union, List
+import yaml
+from git import Repo
+from git.exc import InvalidGitRepositoryError
+
+import ChildProject
+from ChildProject.pipelines.pipeline import Pipeline
+from ChildProject.annotations import AnnotationManager
+
+from ChildProject.tables import assert_dataframe, assert_columns_presence, read_csv_with_dtype
+import ChildProject.pipelines.conversationFunctions as convfunc
+from ..utils import TimeInterval
+
+import time # RM
+
+pipelines = {}
+
+# Create a logger for the module (file)
+logger_conversations = logging.getLogger(__name__)
+# messages are propagated to the higher level logger (ChildProject), used in cmdline.py
+logger_conversations.propagate = True
+
+class Conversations(ABC):
+    """
+    Main class for generating a conversational extraction from a project object and a list of desired features
+
+    :param project: ChildProject instance of the target dataset.
+    :type project: ChildProject.projects.ChildProject
+    :param setname: name of the annotation set to extract conversations from (it must be present in the annotation index)
+    :type setname: str
+    :param features_list: pandas DataFrame containing the desired features (features functions are in conversationsFunctions.py)
+    :type features_list: pd.DataFrame
+    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
+    :type recordings: Union[str, List[str], pd.DataFrame], optional
+    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type from_time: str, optional
+    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type to_time: str, optional
+    :param rec_cols: comma separated columns from recordings.csv to include in the outputted extraction (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value
+    :type rec_cols: str, optional
+    :param child_cols: comma separated columns from children.csv to include in the outputted extraction (optional), None by default
+    :type child_cols: str, optional
+    :param threads: amount of threads to run on, defaults to 1
+    :type threads: int, optional
+    """
+
+    def __init__(
+            self,
+            project: ChildProject.projects.ChildProject,
+            setname: str,
+            features_list: pd.DataFrame,
+            recordings: Union[str, List[str], pd.DataFrame] = None,
+            from_time: str = None,
+            to_time: str = None,
+            rec_cols: str = None, #metadata
+            child_cols: str = None, #metadata
+            threads: int = 1,
+    ):
+
+        self.project = project
+        self.am = ChildProject.annotations.AnnotationManager(self.project)
+        self.threads = int(threads)
+        self.conversations = None
+
+        # check that the callable column is either a callable function or a string that can be found as being part of
+        # the list of features in ChildProject/pipelines/conversationFunctions.py
+        def check_callable(row):
+            if callable(row["callable"]): return row["callable"]
+            if isinstance(row["callable"], str):
+                try:
+                    f = getattr(convfunc, row["callable"])
+                except AttributeError:
+                    raise ValueError(
+                        "{} function is not defined and was not found in ChildProject/pipelines/conversationFunctions.py".format(
+                            row["callable"]))
+                return f
+            else:
+                raise ValueError(
+                    "{} cannot be evaluated as a feature, must be a callable object or a string".format(row["callable"]))
+
+        self.features_df = features_list
+        # block checking presence of required columns and evaluates the callable functions
+        if isinstance(features_list, pd.DataFrame):
+            if ({'callable', 'name'}).issubset(features_list.columns):
+                features_list["callable"] = features_list.apply(check_callable, axis=1)
+                try:
+                    features_list = features_list.set_index('name', verify_integrity=True)
+                except ValueError as e:
+                    raise ValueError("features_list parameter has duplicates in 'name' column") from e
+                features_list['args'] = features_list.drop(['callable'], axis=1).apply(
+                    lambda row: row.dropna().to_dict(), axis=1)
+                features_list = features_list[['callable', 'args']]
+            else:
+                raise ValueError("features_list parameter must contain at least the columns [callable,name]")
+        else:
+            raise ValueError("features_list parameter must be a pandas DataFrame")
+
+        if setname not in self.am.annotations["set"].values:
+            raise ValueError(
+                f"annotation set '{setname}' was not found in the index; "
+                "check spelling and make sure the set was properly imported."
+            )
+        self.set = setname
+        self.features_dict = features_list.to_dict(orient="index")
+
+        # necessary columns to construct the conversations
+        join_columns = {
+            "recording_filename",
+            "child_id",
+            "duration",
+            "session_id",
+            "session_offset",
+        }
+        # get existing columns of the dataset for recordings
+        correct_cols = set(self.project.recordings.columns)
+
+        if rec_cols:
+            # when user requests recording columns, build the list and verify they exist (warn otherwise)
+            rec_cols = set(rec_cols.split(","))
+            for i in rec_cols:
+                if i not in correct_cols:
+                    logger_conversations.warning(
+                        "requested column <{}> does not exist in recordings.csv, "
+                        "ignoring this column. existing columns are : {}".format(
+                            i, correct_cols))
+            rec_cols &= correct_cols
+            # add wanted columns to the one we already get
+            join_columns.update(rec_cols)
+        self.rec_cols = rec_cols  # always set (None/empty when no column was requested), extract() reads it
+
+        join_columns &= correct_cols
+
+        # join dataset annotation with their info in recordings.csv
+        self.am.annotations = self.am.annotations.merge(
+            self.project.recordings[list(join_columns)],
+            left_on="recording_filename",
+            right_on="recording_filename",
+        )
+
+        # get existing columns of the dataset for children
+        correct_cols = set(self.project.children.columns)
+        if child_cols:
+            # when user requests children columns, build the list and verify they exist (warn otherwise)
+            child_cols = set(child_cols.split(","))
+            child_cols.add("child_id")
+            for i in child_cols:
+                if i not in correct_cols:
+                    logger_conversations.warning(
+                        "requested column <{}> does not exist in children.csv, ignoring this column. existing "
+                        "columns are : {}".format(i, correct_cols))
+            child_cols &= correct_cols
+            self.child_cols = child_cols
+
+            # join dataset annotation with their info in children.csv
+            self.am.annotations = self.am.annotations.merge(
+                self.project.children[list(child_cols)],
+                left_on="child_id",
+                right_on="child_id",
+            )
+        else:
+            self.child_cols = None
+
+        if recordings is None:
+            self.recordings = self.project.recordings['recording_filename'].to_list()
+        else:
+            self.recordings = Pipeline.recordings_from_list(recordings)
+
+        # turn from_time and to to_time to datetime objects
+        if from_time:
+            try:
+                self.from_time = datetime.datetime.strptime(from_time, "%H:%M:%S")
+            except ValueError:
+                raise ValueError(
+                    f"invalid value for from_time ('{from_time}'); should have HH:MM:SS format instead")
+        else:
+            self.from_time = None
+
+        if to_time:
+            try:
+                self.to_time = datetime.datetime.strptime(to_time, "%H:%M:%S")
+            except ValueError:
+                raise ValueError(f"invalid value for to_time ('{to_time}'); should have HH:MM:SS format instead")
+        else:
+            self.to_time = None
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        pipelines[cls.SUBCOMMAND] = cls
+
+    def _process_conversation(self, conversation, rec): #process recording line
+        """for one conversation block compute the list of required features and return the results as a dictionary
+
+        :param conversation: segments of one conversation (one group of the groupby, its name is the conversation number)
+        :type conversation: pd.DataFrame
+        :param rec: recording_filename to which belongs that conversation
+        :type rec: str
+        :return: dict containing all the computed features result for that unit
+        :rtype: dict
+        """
+        segments = conversation
+
+        # results that are included regardless of the required list
+        result = {'conversation_onset': segments.iloc[0]['segment_onset'],
+                  'conversation_offset': segments['segment_offset'].max(),
+                  'voc_count': segments['speaker_type'].count(),
+                  'conv_count': conversation.name,
+                  'interval_last_conv': conversation.iloc[0]['time_since_last_conv'],
+                  'recording_filename': rec,
+                  }
+        # apply the functions required
+        for i in self.features_dict:
+            result[i] = self.features_dict[i]["callable"](segments, **self.features_dict[i]['args'])
+
+        return result
+
+    def _process_recording(self, recording, grouper):
+        """for one recording, get the segments required, group by conversation and launch computation for each block
+
+        :param recording: recording_filename to which belongs that conversation
+        :type recording: str
+        :param grouper: name of the column used to group the segments into conversations
+        :return: list with one dict of computed features per conversation
+        :rtype: list[dict]
+        """
+        segments = self.retrieve_segments(recording)
+        segments['voc_duration'] = segments['segment_offset'] - segments['segment_onset']
+
+        # compute the duration between conversation and previous one
+        terminals = segments[segments['conv_count'].shift(-1) != segments['conv_count']]
+        terminals.index += 1
+        steps = (segments[segments['conv_count'].shift(1) != segments['conv_count']]['segment_onset'] -
+                 terminals['segment_offset']).dropna()
+        steps.index = segments.loc[steps.index, 'conv_count']
+        segments['time_since_last_conv'] = segments['conv_count'].map(steps)
+
+        conversations = segments.groupby(grouper, group_keys=True)
+
+        # keep as Series??
+        extractions = conversations.apply(
+            self._process_conversation, rec=recording).to_list() if len(conversations) else []
+
+        return extractions
+
+    def extract(self):
+        """from the initiated self.features_dict, compute each row feature (handles threading)
+        Once the Conversation class is initialized, call this function to extract the features and populate
+        self.conversations
+
+        :return: DataFrame of computed features
+        :rtype: pandas.DataFrame
+        """
+        grouper = 'conv_count'
+        if self.threads == 1:
+
+            results = list(itertools.chain.from_iterable(map(functools.partial(self._process_recording, grouper=grouper), self.recordings)))
+        else:
+            with mp.Pool(
+                    processes=self.threads if self.threads >= 1 else mp.cpu_count()
+            ) as pool:
+                results = list(itertools.chain.from_iterable(pool.map(functools.partial(self._process_recording, grouper=grouper), self.recordings)))
+
+        self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=[grouper, 'recording_filename'])
+
+        # now add the rec_cols and child_cols in the result
+        if self.rec_cols:
+            if self.child_cols:
+                recs = self.project.recordings.drop(columns=(
+                    [col for col in self.project.recordings.columns if (col not in self.rec_cols
+                                                                        and col != 'recording_filename'
+                                                                        and col != 'child_id')]
+                ))
+                chis = self.project.children.drop(columns=(
+                    [col for col in self.project.children.columns if (col not in self.child_cols
+                                                                      and col != 'child_id')]
+                ))
+                meta = recs.merge(chis, how='inner', on='child_id')
+                self.conversations = self.conversations.merge(meta, how='left', on='recording_filename')
+                if 'child_id' not in self.child_cols and 'child_id' not in self.rec_cols:
+                    self.conversations = self.conversations.drop(columns=['child_id'])
+            else:
+                recs = self.project.recordings.drop(columns=(
+                    [col for col in self.project.recordings.columns if (col not in self.rec_cols
+                                                                        and col != 'recording_filename'
+                                                                        and col != 'child_id')]
+                ))
+                self.conversations = self.conversations.merge(recs, how='left', on='recording_filename')
+        elif self.child_cols:
+            chis = self.project.children.drop(columns=(
+                [col for col in self.project.children.columns if (col not in self.child_cols
+                                                                  and col != 'child_id')]
+            ))
+            meta = chis.merge(self.project.recordings[['recording_filename', 'child_id']], how='inner', on='child_id')
+            self.conversations = self.conversations.merge(meta, how='left', on='recording_filename')
+            if 'child_id' not in self.child_cols:
+                self.conversations = self.conversations.drop(columns=['child_id'])
+
+        if not self.conversations.shape[0]:
+            logger_conversations.warning("The extraction did not find any conversation")
+        return self.conversations
+
+    def retrieve_segments(self, recording: str):
+        """return the segments of the selected set for one recording (time range applied only if from_time and to_time are both set)
+
+        :param recording: recording
+        :type recording: str
+        :return: relevant annotation segments
+        :rtype: pandas.DataFrame
+        """
+        annotations = self.am.annotations[recording == self.am.annotations['recording_filename']]
+        annotations = annotations[annotations["set"] == self.set]
+        # restrict to time ranges
+        if self.from_time and self.to_time:
+            matches = self.am.get_within_time_range(
+                annotations, TimeInterval(self.from_time, self.to_time))
+        else:
+            matches = annotations
+
+        if matches.shape[0]:
+            segments = self.am.get_segments(matches)
+            if not segments.shape[0]:
+                # no annotations for that unit
+                return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] +
+                                                     list(annotations.columns) + ['conv_count'])))
+            segments = segments.dropna(subset=['conv_count'])
+        else:
+            # no annotations for that unit
+            return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] +
+                                                 list(annotations.columns) + ['conv_count'])))
+
+        return segments.reset_index(drop=True)
+
+
+class CustomConversations(Conversations):
+    """conversations extraction from a csv file.
+    Extracts a number of features listed in a csv file as a dataframe.
+    the csv file must contain the columns :
+    - 'callable' which is the name of the wanted feature from the list of available features
+    - 'name' is the name to give to that feature
+    - any other necessary argument for the given feature (eg the is_speaker feature requires the 'speaker' argument: add a column 'speaker' in the csv file and fill its cells for this feature with the wanted value (CHI|FEM|MAL|OCH))
+
+    :param project: ChildProject instance of the target dataset.
+    :type project: ChildProject.projects.ChildProject
+    :param setname: name of the set to extract conversations from
+    :type setname: str
+    :param features: name of the csv file listing the features to extract
+    :type features: str
+    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
+    :type recordings: Union[str, List[str], pd.DataFrame], optional
+    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type from_time: str, optional
+    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type to_time: str, optional
+    :param rec_cols: comma separated columns from recordings.csv to include in the outputted conversations (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value
+    :type rec_cols: str, optional
+    :param child_cols: comma separated columns from children.csv to include in the outputted conversations (optional), None by default
+    :type child_cols: str, optional
+    :param threads: amount of threads to run on, defaults to 1
+    :type threads: int, optional
+    """
+
+    SUBCOMMAND = "custom"
+
+    def __init__(
+            self,
+            project: ChildProject.projects.ChildProject,
+            setname: str,
+            features: str,
+            recordings: Union[str, List[str], pd.DataFrame] = None,
+            from_time: str = None,
+            to_time: str = None,
+            rec_cols: str = None,
+            child_cols: str = None,
+            threads: int = 1,
+    ):
+        features_df = pd.read_csv(features)
+
+        super().__init__(project, setname, features_df, recordings=recordings,
+                         from_time=from_time, to_time=to_time, rec_cols=rec_cols,
+                         child_cols=child_cols, threads=threads)
+
+    @staticmethod
+    def add_parser(subparsers, subcommand):
+        parser = subparsers.add_parser(subcommand, help="custom conversation extraction")
+        parser.add_argument("features",
+                            help="name of the csv file containing the list of features to extract",
+                            )
+
+
+class StandardConversations(Conversations):
+    """ACLEW conversations extractor.
+    Extracts a number of conversations from the ACLEW pipeline annotations, which includes:
+
+    - The Voice Type Classifier by Lavechin et al. (arXiv:2005.12656)
+    - The Automatic LInguistic Unit Count Estimator (ALICE) by Räsänen et al. (doi:10.3758/s13428-020-01460-x)
+    - The VoCalisation Maturity model (VCMNet) by Al Futaisi et al. (doi:10.1145/3340555.3353751)
+
+    The features extracted for each conversation are: initiator, finisher,
+    total_duration_of_vocalisations and, for each speaker type (CHI, FEM, MAL, OCH),
+    <speaker>_voc_count and <speaker>_voc_dur.
+
+    :param project: ChildProject instance of the target dataset.
+    :type project: ChildProject.projects.ChildProject
+    :param setname: name of the set to extract conversations from, defaults to 'vtc/conversations'
+    :type setname: str, optional
+    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
+    :type recordings: Union[str, List[str], pd.DataFrame], optional
+    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type from_time: str, optional
+    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type to_time: str, optional
+    :param rec_cols: comma separated columns from recordings.csv to include in the outputted conversations (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value
+    :type rec_cols: str, optional
+    :param child_cols: comma separated columns from children.csv to include in the outputted conversations (optional), None by default
+    :type child_cols: str, optional
+    :param threads: amount of threads to run on, defaults to 1
+    :type threads: int, optional
+    """
+
+    SUBCOMMAND = "standard"
+
+    def __init__(
+            self,
+            project: ChildProject.projects.ChildProject,
+            setname: str = "vtc/conversations",
+            recordings: Union[str, List[str], pd.DataFrame] = None,
+            from_time: str = None,
+            to_time: str = None,
+            rec_cols: str = None,
+            child_cols: str = None,
+            threads: int = 1,
+    ):
+
+        features = np.array([
+            ["who_initiated", "initiator", pd.NA],
+            ["who_finished", "finisher", pd.NA],
+            ["voc_total_dur", "total_duration_of_vocalisations", pd.NA],
+            ["voc_speaker_count", "CHI_voc_count", 'CHI'],
+            ["voc_speaker_count", "FEM_voc_count", 'FEM'],
+            ["voc_speaker_count", "MAL_voc_count", 'MAL'],
+            ["voc_speaker_count", "OCH_voc_count", 'OCH'],
+            ["voc_speaker_dur", "CHI_voc_dur", 'CHI'],
+            ["voc_speaker_dur", "FEM_voc_dur", 'FEM'],
+            ["voc_speaker_dur", "MAL_voc_dur", 'MAL'],
+            ["voc_speaker_dur", "OCH_voc_dur", 'OCH'],
+        ])
+
+        features = pd.DataFrame(features, columns=["callable", "name", "speaker"])
+
+        super().__init__(project, setname, features, recordings=recordings,
+                         from_time=from_time, to_time=to_time,
+                         rec_cols=rec_cols, child_cols=child_cols,
+                         threads=threads)
+
+    @staticmethod
+    def add_parser(subparsers, subcommand):
+        parser = subparsers.add_parser(subcommand, help="standard conversation extraction")
+
+
+class ConversationsPipeline(Pipeline):
+    def __init__(self):
+        self.destination = None
+        self.project = None
+        self.conversations = None
+        self.parameters_path = None
+
+    def run(self, path, destination, pipeline, func=None, **kwargs):
+        self.destination = destination
+        # build a dictionary with all parameters used
+        parameters = locals()
+        parameters = {
+            key: parameters[key]
+            for key in parameters
+            if key not in ["self", "kwargs", "func"] # not sure what func parameter is for, seems unecessary to keep
+        }
+        for key in kwargs: # add all kwargs to dictionary
+            parameters[key] = kwargs[key]
+
+        self.project = ChildProject.projects.ChildProject(path)
+        self.project.read()
+
+        try:
+            datarepo = Repo(path)
+            parameters['dataset_hash'] = datarepo.head.object.hexsha
+        except InvalidGitRepositoryError:
+            logger_conversations.warning("Your dataset is not currently a git repository")
+
+        if pipeline not in pipelines:
+            raise NotImplementedError(f"invalid pipeline '{pipeline}'")
+
+        conversations = pipelines[pipeline](self.project, **kwargs)
+        conversations.extract()
+
+        self.conversations = conversations.conversations
+        self.conversations.to_csv(self.destination, index=False)
+
+        # get the df of features used from the Conversations class
+        features_df = conversations.features_df
+        features_df['callable'] = features_df.apply(lambda row: row['callable'].__name__,
+                                                    axis=1) # from the callables used, find their name back
+        parameters['features_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in
+                                       features_df.to_dict(orient='records')]
+        date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        # create a yaml file with all the parameters used
+        self.parameters_path = os.path.splitext(self.destination)[0] + "_parameters_{}.yml".format(date)
+        logger_conversations.info("exported conversations to {}".format(self.destination))
+        with open(self.parameters_path, "w+") as params_file:
+            yaml.dump(
+                {
+                    "package_version": ChildProject.__version__,
+                    "date": date,
+                    "parameters": parameters,
+                },
+                params_file, sort_keys=False)
+        logger_conversations.info("exported conversations parameters to {}".format(self.parameters_path))
+
+        return self.conversations
+
+    @staticmethod
+    def setup_parser(parser):
+        parser.add_argument("path", help="path to the dataset")
+        parser.add_argument("destination", help="segments destination")
+
+        subparsers = parser.add_subparsers(help="pipeline", dest="pipeline")
+        for pipeline in pipelines:
+            pipelines[pipeline].add_parser(subparsers, pipeline)
+
+        parser.add_argument(
+            "--set",
+            help="Set to use to get the conversation annotations",
+            required=True,
+            dest='setname'
+        )
+
+        parser.add_argument(
+            "--recordings",
+            help=("path to a CSV dataframe containing the list of recordings to sample from (by default, all recordings"
+                  " will be sampled). The CSV should have one column named recording_filename."),
+            default=None,
+        )
+
+        parser.add_argument(
+            "-f",
+            "--from-time",
+            help="time range start in HH:MM:SS format (optional)",
+            default=None,
+        )
+
+        parser.add_argument(
+            "-t",
+            "--to-time",
+            help="time range end in HH:MM:SS format (optional)",
+            default=None,
+        )
+
+        parser.add_argument(
+            "--rec-cols",
+            help=("comma separated columns from recordings.csv to include in the outputted conversations (optional),"
+                  " NA if ambiguous"),
+            default=None,
+        )
+
+        parser.add_argument(
+            "--child-cols",
+            help=("comma separated columns from children.csv to include in the outputted conversations (optional),"
+                  " NA if ambiguous"),
+            default=None,
+        )
+
+        parser.add_argument(
+            "--threads", help="amount of threads to run on", default=1, type=int
+        )
+
+
+class ConversationsSpecificationPipeline(Pipeline):
+    def __init__(self):
+        self.destination = None
+        self.project = None
+        self.conversations = None
+        self.parameters_path = None
+
+    def run(self, parameters_input, func=None):
+        # build a dictionary with all parameters used
+        parameters = None
+        with open(parameters_input, "r") as stream:
+            try:
+                parameters = yaml.safe_load(stream)
+                if parameters and 'parameters' in parameters:
+                    parameters = parameters['parameters']
+            except yaml.YAMLError as exc:
+                raise yaml.YAMLError(
+                    "parsing of the parameters file {} failed. See above exception for more details".format(
+                        parameters_input)) from exc
+
+        if parameters:
+            if "path" not in parameters:
+                raise ValueError(
+                    ("the parameter file {} must contain at least the 'path' key specifying the path to the "
+                     "dataset".format(parameters_input)))
+            if "destination" not in parameters:
+                raise ValueError(
+                    ("the parameter file {} must contain the 'destination' key specifying the file to output "
+                     "the conversations to".format(parameters_input)))
+            if "features_list" not in parameters:
+                raise ValueError(
+                    ("the parameter file {} must contain the 'features_list' key containing the list of the desired "
+                     "features".format(parameters_input)))
+            try:
+                features_df = pd.DataFrame(parameters["features_list"])
+            except Exception as e:
+                raise ValueError(
+                    "The 'features_list' key in {} must be a list of elements".format(parameters_input)) from e
+        else:
+            raise ValueError("could not find any parameters in {}".format(parameters_input))
+
+        try:
+            datarepo = Repo(parameters["path"])
+            parameters['dataset_hash'] = datarepo.head.object.hexsha
+        except InvalidGitRepositoryError:
+            logger_conversations.warning("Your dataset is not currently a git repository")
+
+        self.project = ChildProject.projects.ChildProject(parameters["path"])
+        self.project.read()
+
+        self.destination = parameters['destination']
+
+        unwanted_keys = {'features', 'pipeline'}
+        for i in unwanted_keys:
+            if i in parameters:
+                del parameters[i]
+
+        arguments = {
+            key: parameters[key]
+            for key in parameters
+            if key not in {"features_list", "path", "destination", "dataset_hash"}
+        }
+        try:
+            conversations = Conversations(self.project, features_list=features_df, **arguments)
+        except TypeError as e:
+            raise ValueError('Unrecognized parameter found {}'.format(e.args[0].rsplit(' ', 1)[-1])) from e
+        conversations.extract()
+
+        self.conversations = conversations.conversations
+        self.conversations.to_csv(self.destination, index=False)
+
+        # get the df of features used from the Conversations class
+        features_df = conversations.features_df
+        logger_conversations.debug("features used for the extraction:\n%s", features_df)
+        features_df['callable'] = features_df.apply(lambda row: row['callable'].__name__,
+                                                    axis=1) # from the callables used, find their name back
+        parameters['features_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in
+                                       features_df.to_dict(orient='records')]
+        date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        # create a yaml file with all the parameters used
+        self.parameters_path = os.path.splitext(self.destination)[0] + "_parameters_{}.yml".format(date)
+        logger_conversations.info("exported conversations to {}".format(self.destination))
+        with open(self.parameters_path, "w+") as params_file:
+            yaml.dump(
+                {
+                    "package_version": ChildProject.__version__,
+                    "date": date,
+                    "parameters": parameters,
+                },
+                params_file, sort_keys=False)
+        logger_conversations.info("exported conversations parameters to {}".format(self.parameters_path))
+
+        return self.conversations
+
+    @staticmethod
+    def setup_parser(parser):
+        parser.add_argument("parameters_input", help="path to the yml file with all parameters")
diff --git a/ChildProject/pipelines/derivations.py b/ChildProject/pipelines/derivations.py
index 4a05c8c52..0d9c0c25f 100644
--- a/ChildProject/pipelines/derivations.py
+++ b/ChildProject/pipelines/derivations.py
@@ -175,7 +175,7 @@ def conversations(project,
         return df
 
     else:
-        return pd.DataFrame([])
+        return pd.DataFrame([], columns=['segment_onset', 'raw_filename', 'segment_offset'])
 
 
 def remove_overlaps(project,
diff --git a/ChildProject/pipelines/metrics.py b/ChildProject/pipelines/metrics.py
index 8ab609353..459fadf6c 100644
--- a/ChildProject/pipelines/metrics.py
+++ b/ChildProject/pipelines/metrics.py
@@ -3,6 +3,8 @@ import argparse
 import datetime
 import multiprocessing as mp
+import logging
+
 
 import numpy as np
 import pandas as pd
 from typing import Union, List
@@ -21,6 +23,11 @@ pipelines = {}
+# Create a logger for the module (file)
+logger_metrics = logging.getLogger(__name__)
+# messages are propagated
to the higher level logger (ChildProject), used in cmdline.py +logger_metrics.propagate = True + class Metrics(ABC): """ Main class for generating metrics from a project object and a list of desired metrics @@ -444,7 +451,7 @@ def __init__( def add_parser(subparsers, subcommand): parser = subparsers.add_parser(subcommand, help="metrics from a csv file") parser.add_argument("metrics", - help="name if the csv file containing the list of metrics", + help="name of the csv file containing the list of metrics", ) class LenaMetrics(Metrics): @@ -652,7 +659,7 @@ def __init__( @staticmethod def add_parser(subparsers, subcommand): - parser = subparsers.add_parser(subcommand, help="LENA metrics") + parser = subparsers.add_parser(subcommand, help="ACLEW metrics") parser.add_argument("--vtc", help="vtc set", default="vtc") parser.add_argument("--alice", help="alice set", default="alice") parser.add_argument("--vcm", help="vcm set", default="vcm") diff --git a/docs/source/_ext/directives.py b/docs/source/_ext/directives.py index 2e7143ae8..0ae9a410c 100644 --- a/docs/source/_ext/directives.py +++ b/docs/source/_ext/directives.py @@ -10,7 +10,7 @@ from ChildProject.projects import ChildProject from ChildProject.annotations import AnnotationManager -from ChildProject.pipelines import metricsFunctions +from ChildProject.pipelines import metricsFunctions, conversationFunctions import subprocess @@ -119,6 +119,20 @@ def __init__(self, *args, **kwargs): 'Required arguments': wrap(arguments,25), } df.append(df_entry) + elif array == 'list-conversation-metrics': + ignores = {'conversationFunction'} + metrics = getmembers(conversationFunctions, isfunction) + for name, func in metrics: + if name in ignores : continue + doc = func.__doc__.split('Required keyword arguments:',1) + description = cleandoc(doc[0]) + arguments = cleandoc(doc[1]) if len(doc) > 1 else "" + df_entry = { + 'Callable': name, + 'Description': wrap(description, 45), + 'Required arguments': wrap(arguments,35), + } + 
df.append(df_entry) self.options['file'] = '{}.csv'.format(array) self.options['header-rows'] = 1 diff --git a/docs/source/annotations.rst b/docs/source/annotations.rst index eb4307be4..08b314156 100644 --- a/docs/source/annotations.rst +++ b/docs/source/annotations.rst @@ -111,6 +111,8 @@ remove them from the index. child-project remove-annotations /path/to/dataset --set vtc +.. _derive-annotations: + Derive annotations ~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/conversations.rst b/docs/source/conversations.rst new file mode 100644 index 000000000..a8cedefd7 --- /dev/null +++ b/docs/source/conversations.rst @@ -0,0 +1,101 @@ +Conversations summary extraction +-------------------------------- + +Overview +~~~~~~~~ + +This package allows you to extract descriptive statistics on identified conversations in recordings. The set used for the extraction must contain conversation annotations, that is to say it must have the columns ``segment_onset``, ``segment_offset``, ``speaker_type`` and ``conv_count``. +The :ref:`derive-annotations` pipeline can be used to derive conversation annotations from diarized annotations; we recommend using this on vtc automated annotations to have automated conversation annotations. +A csv file containing the statistics is produced along with a YML parameter file storing all the options used for the extraction. + +.. clidoc:: + + child-project conversations-summary --help + +The conversation extraction will always have the following columns: + +..
csv-table:: + :header: "column", "info" + :widths: 19, 30 + :stub-columns: 1 + + conversation_onset, start of the conversation (ms) inside the recording + conversation_offset, end of the conversation (ms) inside the recording + voc_count, number of vocalizations inside the conversation + conv_count, identifier of the conversation (unique across the recording) + interval_last_conv, interval (ms) between the end of the previous conversation and the start of the current conversation (NA for first) + recording_filename, recording of the conversation + +The list of supported functions is shown below: + +.. _list-conversation-metrics: + +.. custom-table:: + :header: list-conversation-metrics + +Standard Conversations +~~~~~~~~~~~~~~~~~~~~~~ + +The Standard pipeline will extract a list of usual metrics that can be obtained from conversations. Using this pipeline with a set containing conversation annotations +will output: + +.. csv-table:: + :header: "metric", "name", "speaker" + :widths: 19, 19, 19 + :stub-columns: 1 + + "who_initiated", "initiator", + "who_finished", "finisher", + "voc_total_dur", "total_duration_of_vocalisations", + "voc_speaker_count", "CHI_voc_count", "CHI" + "voc_speaker_count", "FEM_voc_count", "FEM" + "voc_speaker_count", "MAL_voc_count", "MAL" + "voc_speaker_count", "OCH_voc_count", "OCH" + "voc_speaker_dur", "CHI_voc_dur", "CHI" + "voc_speaker_dur", "FEM_voc_dur", "FEM" + "voc_speaker_dur", "MAL_voc_dur", "MAL" + "voc_speaker_dur", "OCH_voc_dur", "OCH" + +.. clidoc:: + + child-project conversations-summary /path/to/dataset output.csv standard --help + +Custom Conversations +~~~~~~~~~~~~~~~~~~~~ + +.. _list-structure: + +The Custom conversations pipeline allows you to provide your own list of desired metrics to the pipeline to be extracted. +The list must be in a csv file containing the following columns: + +- callable (required) : name of the metric to extract, see :ref:`the list <list-conversation-metrics>` +- name (required) : name to use in the resulting metrics.
If none is given, a default name will be used. Use this to extract the same metric for different sets and avoid name clashes. +- (depending on the requirements of the metric you chose) : For each required argument of a metric, add a column of that argument's name. + +This is an example of a csv file we use to extract conversation metrics. +We want to extract who initiated the conversation, who finished it, the list of speakers involved and the proportion (between 0 and 1) of speech duration produced by the target child (CHI) in each conversation and the same for female adult speakers (FEM). +So we write 5 lines, one for each metric: we give the reference to the metric (as they are in the table above), the name that we want in the final output, and for some of them, the required argument(s). + +.. csv-table:: + :header: "callable", "name", "speaker" + :widths: 20, 10, 20 + + who_initiated, "initiator", + who_finished, "finisher", + participants, "participants", + voc_dur_contribution, chi_dur_contrib, CHI + voc_dur_contribution, fem_dur_contrib, FEM + +.. clidoc:: + + child-project conversations-summary /path/to/dataset output.csv custom --help + +Conversations extraction from parameter file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To facilitate the extraction of conversations, one can simply use an exhaustive yml parameter file to launch a new extraction. +This file has the exact same structure as the one produced by the pipeline. So you can use the output parameter file of a previous extraction to rerun the same analysis. + +.. clidoc:: + + child-project conversations-specification --help diff --git a/docs/source/index.rst b/docs/source/index.rst index e6bb07ab4..9e40cf2bb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -30,6 +30,7 @@ Welcome to ChildProject's documentation!
tools annotations metrics + conversations processors samplers elan diff --git a/examples/valid_raw_data/annotations/alice/.DS_Store b/examples/valid_raw_data/annotations/alice/.DS_Store deleted file mode 100644 index 1d960c0fb..000000000 Binary files a/examples/valid_raw_data/annotations/alice/.DS_Store and /dev/null differ diff --git a/tests/data/conversations_parameters.yml b/tests/data/conversations_parameters.yml new file mode 100644 index 000000000..c245193a5 --- /dev/null +++ b/tests/data/conversations_parameters.yml @@ -0,0 +1,45 @@ +package_version: 0.2.0 +date: '20240705_184314' +parameters: + path: output/conversations + destination: output/conversations/extra/std_conv.csv + pipeline: standard + setname: custom_conv + recordings: null + from_time: null + to_time: null + rec_cols: null + child_cols: null + threads: 1 + dataset_hash: 710ebfe1f4b118f8a48f69d896c6675bb72ef6ba + features_list: + - callable: who_initiated + name: initiator + - callable: who_finished + name: finisher + - callable: voc_total_dur + name: total_duration_of_vocalisations + - callable: voc_speaker_count + name: CHI_voc_count + speaker: CHI + - callable: voc_speaker_count + name: FEM_voc_count + speaker: FEM + - callable: voc_speaker_count + name: MAL_voc_count + speaker: MAL + - callable: voc_speaker_count + name: OCH_voc_count + speaker: OCH + - callable: voc_speaker_dur + name: CHI_voc_dur + speaker: CHI + - callable: voc_speaker_dur + name: FEM_voc_dur + speaker: FEM + - callable: voc_speaker_dur + name: MAL_voc_dur + speaker: MAL + - callable: voc_speaker_dur + name: OCH_voc_dur + speaker: OCH diff --git a/tests/data/list_features_conv.csv b/tests/data/list_features_conv.csv new file mode 100644 index 000000000..1802ad41b --- /dev/null +++ b/tests/data/list_features_conv.csv @@ -0,0 +1,6 @@ +callable,name,speaker +who_initiated,initiator, +who_finished,finisher, +participants,participants, +voc_dur_contribution,chi_dur_contrib,CHI +voc_dur_contribution,fem_dur_contrib,FEM diff 
--git a/tests/test_annotations.py b/tests/test_annotations.py index 7808c34a2..2d3f67b8b 100644 --- a/tests/test_annotations.py +++ b/tests/test_annotations.py @@ -285,6 +285,31 @@ def test_multiple_imports(project, am, input_file, ow, rimported, rerrors, excep assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected" +def test_import_incorrect_data_types(project, am): + incorrect_types = pd.DataFrame({ + "segment_onset": ["0", "10", "20"], + "segment_offset": ["5", "15", "25"], + "speaker_type": ["CHI", "FEM", "MAN"], + "time_since_last_conv": ["nan", "15", "5"], + "conv_count": [1, 1, 2] + }) + + with pytest.raises(Exception): + am.import_annotations( + pd.DataFrame( + [{"set": "incorrect_types_conv", + "raw_filename": "file.its", + "time_seek": 0, + "recording_filename": "sound.wav", + "range_onset": 0, + "range_offset": 30000000, + "format": "csv", + }] + ), + import_function=partial(fake_vocs, incorrect_types), + ) + + # function used as a derivation function, it should throw errors if not returning dataframe or without required columns def dv_func(a, b, x, type): if type == 'number': diff --git a/tests/test_conversationFunctions.py b/tests/test_conversationFunctions.py new file mode 100644 index 000000000..b3a122e83 --- /dev/null +++ b/tests/test_conversationFunctions.py @@ -0,0 +1,31 @@ +import pandas as pd +import pytest + +import ChildProject.pipelines.conversationFunctions as cf + +@pytest.fixture(scope="function") +def segments(request): + segments = pd.read_csv("tests/data/csv.csv").dropna(subset=['speaker_type']) + segments['voc_duration'] = segments['segment_offset'] - segments['segment_onset'] + + yield segments + +@pytest.mark.parametrize("function,parameters,truth", + [(cf.who_initiated, {}, 'CHI'), + (cf.who_finished, {}, 'MAL'), + (cf.participants, {}, 'CHI/OCH/FEM/MAL'), + (cf.voc_total_dur, {}, 15034), + (cf.is_speaker, {'speaker': 'XXX'}, False), + (cf.is_speaker, {'speaker': 'OCH'}, True), + 
"""Tests for the conversations summary pipelines.

The reference tables under ``tests/truth/`` are read-only fixtures: no test
may write to them, otherwise the comparison against them can never fail.
To refresh one after an intentional change of the output, dump the pipeline
result to the matching file by hand, outside of the test run.
"""
from functools import partial
import numpy as np
import os
import pandas as pd
import pytest
import shutil
from pathlib import Path

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.pipelines.conversations import (Conversations, StandardConversations, CustomConversations,
                                                  ConversationsSpecificationPipeline)

from ChildProject.pipelines.conversationFunctions import conversationFunction, RESERVED

PATH = Path('output/conversations')


def fake_vocs(data, filename):
    """Import function stub: ignore the raw file and return the prepared segments."""
    return data


def import_conversation_set(am, setname, segments):
    """Import ``segments`` as annotation set ``setname`` over the start of sound.wav."""
    am.import_annotations(
        pd.DataFrame(
            [{"set": setname,
              "raw_filename": "file.its",
              "time_seek": 0,
              "recording_filename": "sound.wav",
              "range_onset": 0,
              "range_offset": 30000000,
              "format": "csv",
              }]
        ),
        import_function=partial(fake_vocs, segments),
    )


@pytest.fixture(scope="function")
def project(request):
    # always start from a pristine copy of the example dataset
    if os.path.exists(PATH):
        shutil.rmtree(PATH)
    shutil.copytree(src="examples/valid_raw_data", dst=PATH)

    project = ChildProject(PATH)
    project.read()

    yield project


@pytest.fixture(scope="function")
def am(request, project):
    am = AnnotationManager(project)
    project.recordings['duration'] = [100000000, 2000000]  # force longer durations to allow for imports
    yield am


@pytest.fixture(scope="function")
def segments(request):
    # group some of the example segments into three conversations
    segments = pd.read_csv("tests/data/csv.csv")
    segments.loc[2:4, 'conv_count'] = 1
    segments.loc[8:9, 'conv_count'] = 2
    segments.loc[10:11, 'conv_count'] = 3

    yield segments


def test_failures(project):
    # a set that does not exist must be rejected by both pipelines
    with pytest.raises(ValueError):
        StandardConversations(project, setname="unknown")

    with pytest.raises(ValueError):
        CustomConversations(project, setname="unknown", features='tests/data/list_features_conv.csv')


@pytest.mark.parametrize("error,col_change,new_value",
                         [(ValueError, 'name', 'finisher'),
                          (ValueError, 'callable', 'made_up_function'),
                          (TypeError, 'speaker', 'FEM'),
                          (None, None, None),
                          ])
def test_conversations(project, am, segments, error, col_change, new_value):
    import_conversation_set(am, "custom_conv", segments)

    features = pd.DataFrame([["who_initiated", "initiator", pd.NA],
                             ["who_finished", "finisher", pd.NA],
                             ["voc_speaker_count", "CHI_voc_count", 'CHI'],
                             ], columns=['callable', 'name', 'speaker'])

    if error:
        # corrupt the first feature outside of the raises block so that only
        # the pipeline itself can satisfy the expectation
        features.iloc[0, features.columns.get_loc(col_change)] = new_value
        with pytest.raises(error):
            cm = Conversations(project, 'custom_conv', features)
            cm.extract()
        return

    cm = Conversations(project, 'custom_conv', features)
    results = cm.extract()

    truth = pd.read_csv("tests/truth/python_conversations.csv")

    pd.testing.assert_frame_equal(truth, results)


def test_standard(project, am, segments):
    import_conversation_set(am, "custom_conv", segments)

    std = StandardConversations(project, setname='custom_conv', rec_cols='date_iso',
                                child_cols='experiment,child_dob')
    std.extract()

    truth = pd.read_csv("tests/truth/standard_conversations.csv")

    pd.testing.assert_frame_equal(std.conversations, truth, check_like=True)


def test_custom(project, am, segments):
    import_conversation_set(am, "custom_conv", segments)

    parameters = "tests/data/list_features_conv.csv"

    cm = CustomConversations(project, 'custom_conv', parameters)
    cm.extract()

    truth = pd.read_csv("tests/truth/custom_conversations.csv")

    pd.testing.assert_frame_equal(cm.conversations, truth, check_like=True)


def test_specs(project, am, segments):
    import_conversation_set(am, "custom_conv", segments)

    csp = ConversationsSpecificationPipeline()

    parameters = "tests/data/conversations_parameters.yml"
    csp.run(parameters)

    output = pd.read_csv(csp.destination)
    truth = pd.read_csv("tests/truth/specs_conversations.csv")

    pd.testing.assert_frame_equal(output, truth, check_like=True)

    # the parameter file produced by a run must reproduce the same extraction
    new_params = csp.parameters_path
    csp.run(new_params)

    output = pd.read_csv(csp.destination)

    pd.testing.assert_frame_equal(output, truth, check_like=True)


def test_empty_conversations(project, am):
    empty_segments = pd.DataFrame(
        columns=["segment_onset", "segment_offset", "speaker_type", "time_since_last_conv", "conv_count"])

    import_conversation_set(am, "empty_conv", empty_segments)

    std = StandardConversations(project, setname='empty_conv')
    results = std.extract()

    assert results.empty, "The result should be empty for an empty dataset"


def test_single_entry_conversation(project, am):
    single_segment = pd.DataFrame({
        "segment_onset": [0],
        "segment_offset": [5],
        "speaker_type": ["CHI"],
        "time_since_last_conv": [np.nan],
        "conv_count": [1]
    })

    import_conversation_set(am, "single_conv", single_segment)

    std = StandardConversations(project, setname='single_conv')
    results = std.extract()

    assert len(results) == 1, "The result should contain one conversation for a single entry dataset"


def test_unsorted_annotations(project, am):
    unsorted_segments = pd.DataFrame({
        "segment_onset": [20, 0, 10],
        "segment_offset": [25, 5, 15],
        "speaker_type": ["FEM", "CHI", "MAN"],
        "time_since_last_conv": [5, np.nan, 15],
        "conv_count": [2, 1, 1]
    })

    import_conversation_set(am, "unsorted_conv", unsorted_segments)

    std = StandardConversations(project, setname='unsorted_conv')
    results = std.extract()

    assert not results.empty, "The result should not be empty for unsorted annotations"
+conversation_onset,conversation_offset,voc_count,conv_count,interval_last_conv,recording_filename,initiator,finisher,CHI_voc_count +1984136,1988951,3,1.0,,sound.wav,CHI,FEM,1 +28284010,28287945,2,2.0,26295059.0,sound.wav,OCH,OCH,0 +28288492,28294692,2,3.0,2917.0,sound.wav,OCH,MAL,0 diff --git a/tests/truth/specs_conversations.csv b/tests/truth/specs_conversations.csv new file mode 100644 index 000000000..30bdc9c23 --- /dev/null +++ b/tests/truth/specs_conversations.csv @@ -0,0 +1,4 @@ +conversation_onset,conversation_offset,voc_count,conv_count,interval_last_conv,recording_filename,initiator,finisher,total_duration_of_vocalisations,CHI_voc_count,FEM_voc_count,MAL_voc_count,OCH_voc_count,CHI_voc_dur,FEM_voc_dur,MAL_voc_dur,OCH_voc_dur +1984136,1988951,3,1.0,,sound.wav,CHI,FEM,6660,1,1,0,1,857.0,3459.0,,2344 +28284010,28287945,2,2.0,26295059.0,sound.wav,OCH,OCH,4089,0,0,1,1,,,154.0,3935 +28288492,28294692,2,3.0,2917.0,sound.wav,OCH,MAL,1001,0,0,1,1,,,486.0,515 diff --git a/tests/truth/standard_conversations.csv b/tests/truth/standard_conversations.csv new file mode 100644 index 000000000..a01f62023 --- /dev/null +++ b/tests/truth/standard_conversations.csv @@ -0,0 +1,4 @@ +conversation_onset,conversation_offset,voc_count,conv_count,interval_last_conv,recording_filename,initiator,finisher,total_duration_of_vocalisations,CHI_voc_count,FEM_voc_count,MAL_voc_count,OCH_voc_count,CHI_voc_dur,FEM_voc_dur,MAL_voc_dur,OCH_voc_dur,child_id,date_iso,experiment,child_dob +1984136,1988951,3,1.0,,sound.wav,CHI,FEM,6660,1,1,0,1,857.0,3459.0,,2344,1,2020-04-20,test,2020-01-01 +28284010,28287945,2,2.0,26295059.0,sound.wav,OCH,OCH,4089,0,0,1,1,,,154.0,3935,1,2020-04-20,test,2020-01-01 +28288492,28294692,2,3.0,2917.0,sound.wav,OCH,MAL,1001,0,0,1,1,,,486.0,515,1,2020-04-20,test,2020-01-01