From 1ae49568f26482149f91638f9448aef898ef062e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Agata=20Kozio=C5=82?=
Date: Tue, 27 Feb 2024 16:20:26 +0100
Subject: [PATCH 01/44] Created summarise_conversations()

---
 ChildProject/cmdline.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py
index 8d0872f6..df658979 100755
--- a/ChildProject/cmdline.py
+++ b/ChildProject/cmdline.py
@@ -290,6 +290,42 @@ def derive_annotations(args):
         logger.error("\n".join(errors))
         logger.error("\n".join(warnings))
 
+@subcommand(
+    [
+        arg("source", help="project path"),
+        arg("--input-set", "-i", help="input set", required=True, type=str),
+        arg("--threads", help="number of threads to run on", type=int, default=0),
+        arg("--overwrite-existing", "--ow",
+            help="overwrite an existing summary file when the same output file would be generated (useful when reimporting)",
+            action='store_true'),
+    ]
+)
+def summarise_conversations(args):
+    """generate summary metrics for conversations"""
+
+    project = ChildProject(args.source)
+
+    perform_validation(project, require_success=True, ignore_recordings=True)
+
+    am = AnnotationManager(project)
+    imported, errors_der = am.summarise_conversations(args.input_set, args.threads, overwrite_existing=args.overwrite_existing)
+
+    if errors_der is not None and errors_der.shape[0] > 0:
+        logger.error('Conversational summary generation failed for %d entry/ies', errors_der.shape[0])
+        logger.debug(errors_der)
+
+    if imported is not None and imported.shape[0] > 0:
+        errors, warnings = am.validate(imported, threads=args.threads)
+
+        if len(am.errors) + len(errors) > 0:
+            logger.error(
+                "in the resulting annotations %s errors and %s warnings were found",
+                len(am.errors) + len(errors),
+                len(warnings),
+            )
+            logger.error("\n".join(am.errors))
+            logger.error("\n".join(errors))
+            logger.error("\n".join(warnings))
 
 @subcommand(
     [

From 5f03bcbd7d123e221096f320ca25fbb555c310f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Agata=20Kozio=C5=82?=
Date: Tue, 27 Feb 2024 16:20:32 +0100
Subject: [PATCH 02/44] Create conversations.py

---
 ChildProject/pipelines/conversations.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 ChildProject/pipelines/conversations.py

diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py
new file mode 100644
index 00000000..4aed1d8b
--- /dev/null
+++ b/ChildProject/pipelines/conversations.py
@@ -0,0 +1,5 @@
+import numpy as np
+import pandas as pd
+
+def conversations():
+    pass
\ No newline at end of file

From 72415df2300b8bcb3e0f68cdf6f96d7d92490fd9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Agata=20Kozio=C5=82?=
Date: Tue, 27 Feb 2024 16:40:52 +0100
Subject: [PATCH 03/44] Created _summarise_conversations

---
 ChildProject/annotations.py | 204 +++++++++++++++++++++++++++++++++++-
 1 file changed, 203 insertions(+), 1 deletion(-)

diff --git a/ChildProject/annotations.py b/ChildProject/annotations.py
index c730c700..0118f68a 100644
--- a/ChildProject/annotations.py
+++ b/ChildProject/annotations.py
@@ -11,7 +11,7 @@ import logging
 from . import __version__
-from .pipelines.derivations import DERIVATIONS
+from .pipelines.derivations import DERIVATIONS, conversations
 from .projects import ChildProject
 from .converters import *
 from .tables import IndexTable, IndexColumn, assert_dataframe, assert_columns_presence
@@ -923,6 +923,208 @@ def derive_annotations(self,
 
         return imported, errors
 
+    def _summarise_conversations(
+        self,
+        annotation: dict,
+        overwrite_existing: bool = False,
+    ):
+        """compute the conversation summary for one annotation of the index. This function should not be called outside of this class.
+
+        :param annotation: input annotation dictionary (attributes defined according to :ref:`ChildProject.annotations.AnnotationManager.SEGMENTS_COLUMNS`)
+        :type annotation: dict
+        :param overwrite_existing: choose whether lines with the same set and annotation_filename should be overwritten
+        :type overwrite_existing: bool
+        :return: output annotation dictionary (attributes defined according to :ref:`ChildProject.annotations.AnnotationManager.SEGMENTS_COLUMNS`)
+        :rtype: dict
+        """
+
+        source_recording = os.path.splitext(annotation["recording_filename"])[0]
+        annotation_filename = "{}_{}_{}.csv".format(
+            source_recording, annotation["range_onset"], annotation["range_offset"]
+        )
+        output_filename = os.path.join(
+            "extra", annotation_filename
+        )
+        annotation_result = annotation.copy()  # the output index line starts as a copy of the input line
+
+        # # check if the annotation file already exists in dataset (same filename and same set)
+        # if self.annotations[(self.annotations['set'] == output_set) &
+        #                     (self.annotations['annotation_filename'] == annotation_filename)].shape[0] > 0:
+        #     if overwrite_existing:
+        #         logger_annotations.warning("Derived file %s will be overwritten", output_filename)
+        #
+        #     else:
+        #         logger_annotations.warning("File %s already exists. To overwrite, specify parameter ''overwrite_existing''", output_filename)
+        #         return annotation_result
+
+        # find if there are annotation indexes in the same set that overlap the new annotation
+        # as it is not possible to annotate multiple times the same audio stretch in the same set
+        # ovl_annots = self.annotations[(self.annotations['set'] == output_set) &
+        #                               (self.annotations[
+        #                                    'annotation_filename'] != annotation_filename) &  # this condition avoids matching a line that should be overwritten (so has the same annotation_filename), it is dependent on the previous block!!!
+        #                               (self.annotations['recording_filename'] == annotation['recording_filename']) &
+        #                               (self.annotations['range_onset'] < annotation['range_offset']) &
+        #                               (self.annotations['range_offset'] > annotation['range_onset'])
+        #                               ]
+        # if ovl_annots.shape[0] > 0:
+        #     array_tup = list(
+        #         ovl_annots[['set', 'recording_filename', 'range_onset', 'range_offset']].itertuples(index=False,
+        #                                                                                             name=None))
+        #     annotation_result[
+        #         "error"] = f"derivation for set <{output_set}> recording <{annotation['recording_filename']}> from {annotation['range_onset']} to {annotation['range_offset']} cannot continue because it overlaps with these existing annotation lines: {array_tup}"
+        #     logger_annotations.error("Error: %s", annotation['error'])
+        #     # (f"Error: {annotation['error']}")
+        #     annotation_result["imported_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        #     return annotation_result
+
+        path = os.path.join(
+            self.project.path,
+            "annotations",
+            annotation["set"],
+            "converted",  # EXPAND
+            annotation["annotation_filename"],
+        )
+
+        # TODO CHECK FOR DTYPES
+        df_input = pd.read_csv(path)
+        df = None
+
+        try:
+            df = conversations(df_input)
+            # if callable(import_function):
+            #     df = import_function(df_input)
+            # elif import_function in DERIVATIONS.keys():
+            #     df = DERIVATIONS[import_function](df_input)
+            # else:
+            #     raise ValueError(
+            #         "derivation value '{}' unknown, use one of {}".format(import_function, DERIVATIONS.keys())
+            #     )
+        except:
+            annotation_result["error"] = traceback.format_exc()
+            logger_annotations.error("An error occurred while processing '%s'", path, exc_info=True)
+
+        if df is None or not isinstance(df, pd.DataFrame):
+            annotation_result["imported_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            return annotation_result
+
+        if not df.shape[1]:
+            df = pd.DataFrame(columns=[c.name for c in self.SEGMENTS_COLUMNS])
+
+        df["raw_filename"] = annotation["raw_filename"]
+
+        df["segment_onset"] += np.int64(annotation["time_seek"])
+        df["segment_offset"] += np.int64(annotation["time_seek"])
+        df["segment_onset"] = df["segment_onset"].astype(np.int64)
+        df["segment_offset"] = df["segment_offset"].astype(np.int64)
+
+        annotation_result["time_seek"] = np.int64(annotation["time_seek"])
+        annotation_result["range_onset"] = np.int64(annotation["range_onset"])
+        annotation_result["range_offset"] = np.int64(annotation["range_offset"])
+
+        df = AnnotationManager.clip_segments(
+            df, annotation_result["range_onset"], annotation_result["range_offset"]
+        )
+
+        sort_columns = ["segment_onset", "segment_offset"]
+        if "speaker_type" in df.columns:
+            sort_columns.append("speaker_type")
+
+        df.sort_values(sort_columns, inplace=True)
+
+        os.makedirs(
+            os.path.dirname(os.path.join(self.project.path, output_filename)),
+            exist_ok=True,
+        )
+        df.to_csv(os.path.join(self.project.path, output_filename), index=False)
+
+        annotation_result["annotation_filename"] = annotation_filename
+        annotation_result["imported_at"] = datetime.datetime.now().strftime(
+            "%Y-%m-%d %H:%M:%S"
+        )
+        annotation_result["package_version"] = __version__
+
+        return annotation_result
+
+    def summarise_conversations(self,
+                                input_set: str,
+                                threads: int = -1,
+                                overwrite_existing: bool = False,
+                                ) -> (pd.DataFrame, pd.DataFrame):
+        """Compute conversation summaries for all annotations of a given set.
+
+        :param input_set: name of the set of annotations to summarise
+        :type input_set: str
+        :param threads: If > 1, computations will be run on ``threads`` threads, defaults to -1
+        :type threads: int, optional
+        :param overwrite_existing: choose whether lines with the same set and annotation_filename should be overwritten
+        :type overwrite_existing: bool, optional
+        :return: tuple of a dataframe of summarised annotations, as in :ref:`format-annotations`, and a dataframe of errors
+        :rtype: tuple (pd.DataFrame, pd.DataFrame)
+        """
+        input_processed = self.annotations[self.annotations['set'] == input_set].copy()
+        assert not input_processed.empty, "Input set {0} does not exist".format(input_set)
+
+        if threads == 1:
+            imported = input_processed.apply(
+                partial(self._summarise_conversations,
+                        overwrite_existing=overwrite_existing
+                        ), axis=1
+            ).to_dict(orient="records")
+        else:
+
+            with mp.Pool(processes=threads if threads > 0 else mp.cpu_count()) as pool:
+                imported = pool.map(
+                    partial(self._summarise_conversations,
+                            overwrite_existing=overwrite_existing
+                            ),
+                    input_processed.to_dict(orient="records"),
+                )
+
+        imported = pd.DataFrame(imported)
+        imported.drop(
+            list(set(imported.columns) - {c.name for c in self.INDEX_COLUMNS}),
+            axis=1,
+            inplace=True,
+        )
+
+        if 'error' in imported.columns:
+            errors = imported[~imported["error"].isnull()]
+            imported = imported[imported["error"].isnull()]
+            # when errors occur, export them to a separate csv in extra
+            if errors.shape[0] > 0:
+                output = os.path.join(self.project.path, "extra",
+                                      "errors_conv_summary_{}.csv".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))
+                errors.to_csv(output, index=False)
+                logger_annotations.info("Errors summary exported to %s", output)
+        else:
+            errors = None
+
+        self.read()
+        self.annotations = pd.concat([self.annotations, imported], sort=False)
+        # at this point, 2 lines with the same set and annotation_filename can happen if overwrite was specified;
+        # dropping duplicates removes the first importation and keeps the more recent one
+        self.annotations = self.annotations.sort_values('imported_at').drop_duplicates(
+            subset=["set", "recording_filename", "range_onset", "range_offset"], keep='last')
+        self.write()
+
+        sets = set(input_processed['set'].unique())
+        outdated_sets = self._check_for_outdated_merged_sets(sets=sets)
+        for warning in outdated_sets:
+            logger_annotations.warning("warning: %s", warning)
+
+        return imported, errors
+
     def get_subsets(self, annotation_set: str, recursive: bool = False) -> List[str]:
         """Retrieve the list of subsets belonging to a given set of annotations.
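
For reference, a minimal usage sketch of the summarise_conversations() API introduced above; the dataset path and the "vtc" set name are placeholders, any properly imported annotation set would work:

    from ChildProject.projects import ChildProject
    from ChildProject.annotations import AnnotationManager

    project = ChildProject("path/to/dataset")  # placeholder dataset path
    project.read()  # load the dataset metadata

    am = AnnotationManager(project)
    # summarise conversations of an existing set on 4 threads, overwriting
    # summary files left over from a previous run
    imported, errors = am.summarise_conversations("vtc", threads=4, overwrite_existing=True)
    if errors is not None:
        print(errors)
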
From 24822a87a0a8d70204d4c3737f5566f5c029c804 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Agata=20Kozio=C5=82?=
Date: Wed, 28 Feb 2024 11:34:34 +0100
Subject: [PATCH 04/44] Create conversation metrics class

---
 ChildProject/cmdline.py                       |   1 +
 .../pipelines/conversationFunctions.py        | 635 +++++++++++++
 ChildProject/pipelines/conversations.py       | 886 +++++++++++++++++-
 3 files changed, 1520 insertions(+), 2 deletions(-)
 create mode 100644 ChildProject/pipelines/conversationFunctions.py

diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py
index df658979..e5e4de98 100755
--- a/ChildProject/cmdline.py
+++ b/ChildProject/cmdline.py
@@ -677,6 +677,7 @@ def main():
     register_pipeline("anonymize", AnonymizationPipeline)
     register_pipeline("metrics", MetricsPipeline)
     register_pipeline("metrics-specification", MetricsSpecificationPipeline)
+    #register_pipeline("conversations-summary", ConversationsPipeline)
 
     args = parser.parse_args()
     args.func(args)
diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py
new file mode 100644
index 00000000..35d9dec0
--- /dev/null
+++ b/ChildProject/pipelines/conversationFunctions.py
@@ -0,0 +1,635 @@
+# define functions to calculate metrics
+import pandas as pd
+import numpy as np
+import ast
+import re
+import functools
+from typing import Union, Set, Tuple
+
+"""
+This file lists all the metrics functions commonly used.
+New metrics can be added by defining new functions for the Metrics class to use:
+    - Create a new function using the same arguments (i.e. annotations, duration, **kwargs)
+    - Define the calculation of the metric with:
+        - annotations, which is a dataframe containing all the relevant annotated segments to use. It contains the
+          annotation content (https://childproject.readthedocs.io/en/latest/format.html#id10) joined with the annotation
+          index info (https://childproject.readthedocs.io/en/latest/format.html#id11) as well as any column that was
+          requested to be added to the results by the user using --child-cols or --rec-cols (e.g. --child-cols child_dob,
+          languages will make columns 'child_dob' and 'languages' available)
+        - duration, which is the duration of audio annotated in milliseconds
+        - kwargs, whatever keyword parameter you choose to pass to the function (except 'name', 'callable', 'set', which
+          cannot be used). This will need to be given with the list of metrics when called
+    - Wrap your function with the 'metricFunction' decorator to make it callable by the pipeline; read the metricFunction
+      help for more info
+
+!! Metrics functions should still behave and return the correct result when receiving an empty dataframe
+"""
+
+# error message in case of missing columns in annotations
+MISSING_COLUMNS = 'The given set <{}> does not have the required column(s) <{}> for computing the {} metric'
+
+RESERVED = {'set', 'name', 'callable'}  # arguments with a reserved usage; use other keyword labels
+
+
+def metricFunction(args: set, columns: Union[Set[str], Tuple[Set[str], ...]], empty_value=0, default_name: str = None):
+    """Decorator for all metrics functions to make them ready to be called by the pipeline.
+
+    :param args: set of required keyword arguments for that function, raises ValueError if they were not given; \
+    you cannot use the keywords [name, callable, set] as they are reserved
+    :type args: set
+    :param columns: required columns in the dataframe given, missing columns raise ValueError
+    :type columns: set
+    :param default_name: default name to use for the metric in the resulting dataframe. Every keyword argument found in the name will be replaced by its value (e.g. 'voc_speaker_ph' uses kwarg 'speaker' so if speaker = 'CHI', name will be 'voc_chi_ph'). If no name is given, the __name__ of the function is used
+    :type default_name: str
+    :param empty_value: value to return when annotations are empty but the unit was annotated (e.g. 0 for counts like voc_speaker_ph, None for proportions like lp_n)
+    :type empty_value: float|int
+    :return: new function to substitute the metric function
+    :rtype: Callable
+    """
+
+    def decorator(function):
+        for a in args:
+            if a in RESERVED:
+                raise ValueError(
+                    'Error when defining {} with required argument {}, you cannot use reserved keywords {},\
+                    change your required argument name'.format(
+                        function.__name__, a, RESERVED))
+
+        @functools.wraps(function)
+        def new_func(annotations: pd.DataFrame, duration: int, **kwargs):
+            for arg in args:
+                if arg not in kwargs:
+                    raise ValueError(f"{function.__name__} metric needs an argument <{arg}>")
+            # if a name is explicitly given, use it
+            if 'name' in kwargs and not pd.isnull(kwargs['name']) and kwargs['name']:
+                metric_name = kwargs['name']
+            # else if a default name for the function exists, use it
+            elif default_name:
+                metric_name = default_name
+            # else, no name was found, use the name of the function
+            else:
+                metric_name = function.__name__
+
+            metric_name_replaced = metric_name
+            # metric_name is the basename used to designate this metric (voc_speaker_ph),
+            # metric_name_replaced replaces the kwargs found in the name by their values,
+            # giving the metric name for that instance only (voc_chi_ph)
+            for arg in kwargs:
+                metric_name_replaced = re.sub(arg, str(kwargs[arg]).lower(), metric_name_replaced)
+            if annotations.shape[0]:
+                # if multiple possibilities of columns, explore each and fail only if each combination is missing
+                # a column; if one possibility, fail if a column is missing
+                if isinstance(columns, tuple) and len(columns) > 0 and isinstance(columns[0], set):
+                    missing_columns = []
+                    for possible_cols in columns:
+                        possible_missing = possible_cols - set(annotations.columns)
+                        if possible_missing:
+                            missing_columns.append(possible_missing)
+                    # if we have as many cases of missing columns as possibilities, we can't compute the metric
+                    if len(missing_columns) == len(columns):
+                        raise ValueError(
+                            MISSING_COLUMNS.format(annotations['set'].iloc[0],
+                                                   ' or '.join([str(s) for s in missing_columns]),
+                                                   metric_name))
+                else:
+                    missing_columns = columns - set(annotations.columns)
+                    if missing_columns:
+                        raise ValueError(
+                            MISSING_COLUMNS.format(annotations['set'].iloc[0], missing_columns, metric_name))
+                res = function(annotations, duration, **kwargs)
+            else:  # no annotation for that unit
+                res = empty_value if duration else None  # duration != 0 => was annotated but no segments there
+            return metric_name_replaced, res
+
+        return new_func
+
+    return decorator
+
+
+def peak_hour_metric(empty_value=0):
+    """
+    empty_value : should repeat the empty value of the metric function wrapper (as this will be used for empty periods)
+    """
+
+    def decorator(function):
+        """Decorator for a metric function, selecting the maximum value observed over 1h periods; the resulting
+        function name is prefixed with 'peak_'
+        """
+
+        @functools.wraps(function)
+        def new_func(annotations: pd.DataFrame, duration: int, **kwargs):
+            # time to consider for periods, here 1h by default, else put it in kwargs
+            period_time = 3600000 if 'period_time' not in kwargs else kwargs['period_time']
+            periods = duration // period_time  # number of hours to consider
+
+            # what hour it belongs to (we made the choice of using onset to choose the hour)
+            annotations['hour_number_metric'] = annotations['segment_onset'] // period_time
+
+            result_array = np.array([])
+            for i in range(periods):
+                # select the annotations for this hour
+                period_annotations = annotations[annotations['hour_number_metric'] == i]
+
+                if period_annotations.shape[0]:
+                    # compute metric for the period
+                    metric = function(period_annotations, period_time, **kwargs)
+                else:
+                    metric = empty_value
+
+                result_array = np.append(result_array, metric)  # store the result
+
+            # if we have results, return the max, else return NaN
+            if len(result_array):
+                return np.nanmax(result_array)
+            else:
+                return np.nan
+
+        # wraps will give the same name and doc, so we need to slightly edit them for the peak function
+        new_func.__doc__ = "Computing the peak for 1h for the following metric:\n" + function.__doc__
+        new_func.__name__ = "peak_" + function.__name__
+        new_func.__qualname__ = "peak_" + function.__qualname__
+        return new_func
+
+    return decorator
+
+
+def per_hour_metric():
+    """
+    """
+
+    def decorator(function):
+        """Decorator creating a metric function that normalizes the original value to a per-hour rate; the resulting
+        function name is suffixed with '_ph'
+        """
+
+        @functools.wraps(function)
+        def new_func(annotations: pd.DataFrame, duration: int, **kwargs):
+            # scale the result to a per-hour value (duration is in milliseconds)
+            return function(annotations, duration, **kwargs) * (3600000 / duration)
+
+        # wraps will give the same name and doc, so we need to slightly edit them for the per-hour function
+        new_func.__doc__ = function.__doc__ + " This value is a 'per hour' value."
+        new_func.__name__ = function.__name__ + '_ph'
+        new_func.__qualname__ = function.__qualname__ + '_ph'
+        return new_func
+
+    return decorator
+
+
+def voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
+    """number of vocalizations for a given speaker type
+
+    Required keyword arguments:
+        - speaker : speaker_type to use
+    """
+    return annotations[annotations["speaker_type"] == kwargs["speaker"]].shape[0]
+
+
+# Decorate for the peak metric, the per-hour metric, and then the classic metric to avoid conflicts of decoration
+peak_voc_speaker = metricFunction({"speaker"}, {"speaker_type"})(peak_hour_metric()(voc_speaker))
+voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type"})(per_hour_metric()(voc_speaker))
+voc_speaker = metricFunction({"speaker"}, {"speaker_type"})(voc_speaker)
+
+
+def voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
+    """total duration of vocalizations by a given speaker type in milliseconds
+
+    Required keyword arguments:
+        - speaker : speaker_type to use
+    """
+    return annotations[annotations["speaker_type"] == kwargs["speaker"]]["duration"].sum()
+
+
+# Decorate for the peak metric, the per-hour metric, and then the classic metric to avoid conflicts of decoration
+peak_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "duration"})(peak_hour_metric()(voc_dur_speaker))
+voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "duration"})(per_hour_metric()(voc_dur_speaker))
+voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "duration"})(voc_dur_speaker)
+
+
+@metricFunction({"speaker"}, {"speaker_type", "duration"}, np.nan)
+def avg_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
+    """average duration in milliseconds of vocalizations for a given speaker type
+
+    Required keyword arguments:
+        - speaker : speaker_type to use
+    """
+    return annotations[annotations["speaker_type"] == kwargs["speaker"]]["duration"].mean()
+
+
+def wc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
+    """number of words for a given speaker type
+
+    Required keyword arguments:
+        - speaker : speaker_type to use
+    """
+    return annotations[annotations["speaker_type"] == kwargs["speaker"]]["words"].sum()
+
+
+peak_wc_speaker = metricFunction({"speaker"}, {"speaker_type", "words"})(peak_hour_metric()(wc_speaker))
+wc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "words"})(per_hour_metric()(wc_speaker))
+wc_speaker = metricFunction({"speaker"}, {"speaker_type", "words"})(wc_speaker)
+
+
+def sc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
+    """number of syllables for a given speaker type
+
+    Required keyword arguments:
+        - speaker : speaker_type to use
+    """
+    return annotations[annotations["speaker_type"] == kwargs["speaker"]]["syllables"].sum()
+
+
+peak_sc_speaker = metricFunction({"speaker"}, {"speaker_type", "syllables"})(peak_hour_metric()(sc_speaker))
+sc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "syllables"})(per_hour_metric()(sc_speaker))
+sc_speaker = metricFunction({"speaker"}, {"speaker_type", "syllables"})(sc_speaker)
+
+
+def pc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
+    """number of phonemes for a given speaker type
+
+    Required keyword arguments:
+        - speaker : speaker_type to use
+    """
+    return annotations[annotations["speaker_type"] == kwargs["speaker"]]["phonemes"].sum()
+
+
+peak_pc_speaker = metricFunction({"speaker"}, {"speaker_type", "phonemes"})(peak_hour_metric()(pc_speaker))
+pc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "phonemes"})(per_hour_metric()(pc_speaker))
metricFunction({"speaker"}, {"speaker_type", "phonemes"})(per_hour_metric()(pc_speaker)) +pc_speaker = metricFunction({"speaker"}, {"speaker_type", "phonemes"})(pc_speaker) + + +def wc_adu(annotations: pd.DataFrame, duration: int, **kwargs): + """number of words for all speakers + + Required keyword arguments: + """ + return annotations["words"].sum() + + +peak_wc_adu = metricFunction(set(), {"words"})(peak_hour_metric()(wc_adu)) +wc_adu_ph = metricFunction(set(), {"words"})(per_hour_metric()(wc_adu)) +wc_adu = metricFunction(set(), {"words"})(wc_adu) + + +def sc_adu(annotations: pd.DataFrame, duration: int, **kwargs): + """number of syllables for all speakers + + Required keyword arguments: + """ + return annotations["syllables"].sum() + + +peak_sc_adu = metricFunction(set(), {"syllables"})(peak_hour_metric()(sc_adu)) +sc_adu_ph = metricFunction(set(), {"syllables"})(per_hour_metric()(sc_adu)) +sc_adu = metricFunction(set(), {"syllables"})(sc_adu) + + +def pc_adu(annotations: pd.DataFrame, duration: int, **kwargs): + """number of phonemes for all speakers + + Required keyword arguments: + """ + return annotations["phonemes"].sum() + + +peak_pc_adu = metricFunction(set(), {"phonemes"})(peak_hour_metric()(pc_adu)) +pc_adu_ph = metricFunction(set(), {"phonemes"})(per_hour_metric()(pc_adu)) +pc_adu = metricFunction(set(), {"phonemes"})(pc_adu) + + +def cry_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """number of cry vocalizations for a given speaker (based on vcm_type or lena cries) + + Required keyword arguments: + - speaker : speaker_type to use + """ + if 'vcm_type' in annotations.columns: + return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & + (annotations["vcm_type"] == "Y")].shape[0] + # elif 'cries' in annotations.columns: + else: + return annotations[annotations['speaker_type'] == kwargs["speaker"]]["cries"].apply( + lambda x: len(ast.literal_eval(x))).sum() + + +peak_cry_voc_speaker = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"}) + )(peak_hour_metric()(cry_voc_speaker)) +cry_voc_speaker_ph = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"}) + )(per_hour_metric()(cry_voc_speaker)) +cry_voc_speaker = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"}) + )(cry_voc_speaker) + + +def cry_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """total duration of cry vocalizations by a given speaker type in milliseconds (based on vcm_type or lena cry) + + Required keyword arguments: + - speaker : speaker_type to use + """ + if 'vcm_type' in annotations.columns and 'duration' in annotations.columns: + return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & + (annotations["vcm_type"] == "Y")]["duration"].sum() + # elif 'child_cry_vfx_len' in annotations.columns: + else: + return annotations[annotations['speaker_type'] == kwargs["speaker"]]["child_cry_vfx_len"].sum() + + +peak_cry_voc_dur_speaker = metricFunction({"speaker"}, ( +{"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))( + peak_hour_metric()(cry_voc_dur_speaker)) +cry_voc_dur_speaker_ph = metricFunction({"speaker"}, ( +{"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))( + per_hour_metric()(cry_voc_dur_speaker)) +cry_voc_dur_speaker = metricFunction({"speaker"}, + ({"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))( + cry_voc_dur_speaker) + + 
+@metricFunction({"speaker"}, ({"speaker_type", "vcm_type", "duration"}, {'speaker_type', "child_cry_vfx_len", "cries"}), + np.nan) +def avg_cry_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """average duration of cry vocalizations by a given speaker type (based on vcm_type or lena cries) + + Required keyword arguments: + - speaker : speaker_type to use + """ + if 'vcm_type' in annotations.columns and 'duration' in annotations.columns: + value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & + (annotations["vcm_type"] == "Y")]["duration"].mean() + else: + annots = annotations[annotations['speaker_type'] == kwargs["speaker"]] + value = annots["child_cry_vfx_len"].sum() / annots["cries"].apply(lambda x: len(ast.literal_eval(x))).sum() + + if pd.isnull(value): + value = 0 + return value + + +def can_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """number of canonical vocalizations for a given speaker type (based on vcm_type) + + Required keyword arguments: + - speaker : speaker_type to use + """ + return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")].shape[ + 0] + + +peak_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(peak_hour_metric()(can_voc_speaker)) +can_voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(per_hour_metric()(can_voc_speaker)) +can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(can_voc_speaker) + + +def can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """total duration of canonical vocalizations by a given speaker type in milliseconds (based on vcm_type) + + Required keyword arguments: + - speaker : speaker_type to use + """ + return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")][ + "duration"].sum() + + +peak_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})( + peak_hour_metric()(can_voc_dur_speaker)) +can_voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})( + per_hour_metric()(can_voc_dur_speaker)) +can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(can_voc_dur_speaker) + + +@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan) +def avg_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """average duration of canonical vocalizations for a given speaker type (based on vcm_type) + + Required keyword arguments: + - speaker : speaker_type to use + """ + value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")][ + "duration"].mean() + if pd.isnull(value): value = 0 + return value + + +def non_can_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """number of non-canonical vocalizations for a given speaker type (based on vcm_type) + + Required keyword arguments: + - speaker : speaker_type to use + """ + return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & + (annotations["vcm_type"] == "N")].shape[0] + + +peak_non_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})( + peak_hour_metric()(non_can_voc_speaker)) +non_can_voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})( + per_hour_metric()(non_can_voc_speaker)) +non_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", 
"vcm_type"})(non_can_voc_speaker) + + +def non_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """total duration of non-canonical vocalizations by a given speaker type in milliseconds (based on vcm_type) + + Required keyword arguments: + - speaker : speaker_type to use + """ + return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & + (annotations["vcm_type"] == "N")]["duration"].sum() + + +peak_non_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})( + peak_hour_metric()(non_can_voc_dur_speaker)) +non_can_voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})( + per_hour_metric()(non_can_voc_dur_speaker)) +non_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(non_can_voc_dur_speaker) + + +@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan) +def avg_non_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): + """average duration of non-canonical vocalizations for a given speaker type (based on vcm_type) + + Required keyword arguments: + - speaker : speaker_type to use + """ + value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & + (annotations["vcm_type"] == "N")]["duration"].mean() + if pd.isnull(value): + value = 0 + return value + + +@metricFunction(set(), set(), np.nan) +def lp_n(annotations: pd.DataFrame, duration: int, **kwargs): + """linguistic proportion on the number of vocalizations for CHI (based on vcm_type or [cries,vfxs,utterances_count] if vcm_type does not exist) + + Required keyword arguments: + """ + if {"cries", "vfxs", "utterances_count"}.issubset(annotations.columns): + annotations = annotations[annotations["speaker_type"] == "CHI"] + cries = annotations["cries"].apply(lambda x: len(ast.literal_eval(x))).sum() + vfxs = annotations["vfxs"].apply(lambda x: len(ast.literal_eval(x))).sum() + utterances = annotations["utterances_count"].sum() + total = (utterances + cries + vfxs) + if total: + value = utterances / total + else: + value = np.nan + elif "vcm_type" in annotations.columns: + speech_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & + (annotations["vcm_type"].isin(["N", "C"]))].shape[0] + cry_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & (annotations["vcm_type"] == "Y")].shape[0] + total = speech_voc + cry_voc + if total: + value = speech_voc / total + else: + value = np.nan + else: + raise ValueError( + "the given set does not have the necessary columns for this metric, choose a set that contains either [" + "vcm_type] or [cries,vfxs,utterances_count]") + return value + + +@metricFunction(set(), {"speaker_type", "vcm_type"}, np.nan) +def cp_n(annotations: pd.DataFrame, duration: int, **kwargs): + """canonical proportion on the number of vocalizations for CHI (based on vcm_type) + + Required keyword arguments: + """ + speech_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & + (annotations["vcm_type"].isin(["N", "C"]))].shape[0] + can_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & (annotations["vcm_type"] == "C")].shape[0] + if speech_voc: + value = can_voc / speech_voc + else: + value = np.nan + return value + + +@metricFunction(set(), set(), np.nan) +def lp_dur(annotations: pd.DataFrame, duration: int, **kwargs): + """linguistic proportion on the duration of vocalizations for CHI (based on vcm_type or [child_cry_vfxs_len,utterances_length] if vcm_type does not exist) + + 
+    Required keyword arguments:
+    """
+    if {"child_cry_vfx_len", "utterances_length"}.issubset(annotations.columns):
+        annotations = annotations[annotations["speaker_type"] == "CHI"]
+        utter_len = annotations["utterances_length"].sum()
+        total = annotations["child_cry_vfx_len"].sum() + utter_len
+        if total:
+            value = utter_len / total
+        else:
+            value = np.nan
+    elif "vcm_type" in annotations.columns:
+        speech_dur = annotations.loc[(annotations["speaker_type"] == "CHI") &
+                                     (annotations["vcm_type"].isin(["N", "C"]))]["duration"].sum()
+        cry_dur = annotations.loc[(annotations["speaker_type"] == "CHI") &
+                                  (annotations["vcm_type"] == "Y")]["duration"].sum()
+        total = speech_dur + cry_dur
+        if total:
+            value = speech_dur / total
+        else:
+            value = np.nan
+    else:
+        raise ValueError(
+            "the given set does not have the necessary columns for this metric, choose a set that contains either ["
+            "vcm_type] or [child_cry_vfx_len,utterances_length]")
+    return value
+
+
+@metricFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan)
+def cp_dur(annotations: pd.DataFrame, duration: int, **kwargs):
+    """canonical proportion on the duration of vocalizations for CHI (based on vcm_type)
+
+    Required keyword arguments:
+    """
+    speech_dur = annotations.loc[(annotations["speaker_type"] == "CHI") &
+                                 (annotations["vcm_type"].isin(["N", "C"]))]["duration"].sum()
+    can_dur = annotations.loc[(annotations["speaker_type"] == "CHI") &
+                              (annotations["vcm_type"] == "C")]["duration"].sum()
+    if speech_dur:
+        value = can_dur / speech_dur
+    else:
+        value = np.nan
+    return value
+
+
+def lena_CVC(annotations: pd.DataFrame, duration: int, **kwargs):
+    """number of child vocalizations according to LENA's extraction
+
+    Required keyword arguments:
+    """
+    return annotations["utterances_count"].sum()
+
+
+peak_lena_CVC = metricFunction(set(), {"utterances_count"})(peak_hour_metric()(lena_CVC))
+lena_CVC_ph = metricFunction(set(), {"utterances_count"})(per_hour_metric()(lena_CVC))
+lena_CVC = metricFunction(set(), {"utterances_count"})(lena_CVC)
+
+
+def lena_CTC(annotations: pd.DataFrame, duration: int, **kwargs):
+    """number of conversational turn counts according to LENA's extraction
+
+    Required keyword arguments:
+    """
+    conv_types = {'TIMR', 'TIFR'}
+    return annotations[annotations["lena_conv_turn_type"].isin(conv_types)].shape[0]
+
+
+peak_lena_CTC = metricFunction(set(), {"lena_conv_turn_type"})(peak_hour_metric()(lena_CTC))
+lena_CTC_ph = metricFunction(set(), {"lena_conv_turn_type"})(per_hour_metric()(lena_CTC))
+lena_CTC = metricFunction(set(), {"lena_conv_turn_type"})(lena_CTC)
+
+
+def simple_CTC(annotations: pd.DataFrame,
+               duration: int,
+               interlocutors_1=('CHI',),
+               interlocutors_2=('FEM', 'MAL', 'OCH'),
+               max_interval=1000,
+               min_delay=0,
+               **kwargs):
+    """number of conversational turn counts based on vocalizations occurring
+    in a given interval of one another
+
+    keyword arguments:
+        - interlocutors_1 : first group of interlocutors, default = ['CHI']
+        - interlocutors_2 : second group of interlocutors, default = ['FEM','MAL','OCH']
+        - max_interval : maximum interval in ms between the end of one vocalization and the start of the next for it to be considered a turn, default = 1000
+        - min_delay : minimum delay in ms between the onsets of the two vocalizations for the turn to be counted, default = 0
+    """
+    # build the interactants groups, every label in interlocutors_1 can interact with interlocutors_2 and vice versa
+    speakers = set(interlocutors_1 + interlocutors_2)
+    interactants = {k: set(interlocutors_2) for k in interlocutors_1}
+    for k in interlocutors_2:
+        if k in interactants:
+            interactants[k] = interactants[k] | set(interlocutors_1)
+        else:
+            interactants[k] = set(interlocutors_1)
+
+    annotations = annotations[annotations["speaker_type"].isin(speakers)].copy()
+
+    if annotations.shape[0]:
+        # store the duration between vocalizations
+        annotations["iti"] = annotations["segment_onset"] - annotations["segment_offset"].shift(1)
+        # store the previous speaker
+        annotations["prev_speaker_type"] = annotations["speaker_type"].shift(1)
+
+        annotations["delay"] = annotations["segment_onset"] - annotations["segment_onset"].shift(1)
+
+        # not using absolute value for 'iti' is a choice and should be evaluated (we allow speakers to 'interrupt'
+        # themselves)
+        annotations["is_CT"] = (
+                (annotations.apply(lambda row: row["prev_speaker_type"] in interactants[row['speaker_type']], axis=1))
+                &
+                (annotations['iti'] < max_interval)
+                &
+                (annotations['delay'] >= min_delay)
+        )
+
+        return annotations['is_CT'].sum()
+    else:
+        return 0
+
+
+peak_simple_CTC = metricFunction(set(), {"speaker_type"})(peak_hour_metric()(simple_CTC))
+simple_CTC_ph = metricFunction(set(), {"speaker_type"})(per_hour_metric()(simple_CTC))
+simple_CTC = metricFunction(set(), {"speaker_type"})(simple_CTC)
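+
+# Usage sketch (illustrative, with made-up data): each decorated metric receives
+# the annotation segments and the annotated duration in milliseconds, and returns
+# a (name, value) pair; keyword arguments such as 'speaker' parameterize both the
+# value and the resulting name.
+#
+#     segs = pd.DataFrame({'speaker_type': ['CHI', 'FEM', 'CHI'],
+#                          'segment_onset': [0, 60000, 180000],
+#                          'segment_offset': [5000, 62000, 183000]})
+#     name, value = voc_speaker_ph(segs, duration=3600000, speaker='CHI')
+#     # name == 'voc_chi_ph', value == 2.0 (2 CHI vocalizations over 1h of annotation)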
diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py
index 4aed1d8b..16ac4b50 100644
--- a/ChildProject/pipelines/conversations.py
+++ b/ChildProject/pipelines/conversations.py
@@ -1,5 +1,887 @@
+from abc import ABC, abstractmethod
+import os
+import argparse
+import datetime
+import multiprocessing as mp
 import numpy as np
 import pandas as pd
+from typing import Union, List
+import yaml
+from git import Repo
+from git.exc import InvalidGitRepositoryError
 
-def conversations():
-    pass
\ No newline at end of file
+import ChildProject
+from ChildProject.pipelines.pipeline import Pipeline
+
+from ChildProject.tables import assert_dataframe, assert_columns_presence, read_csv_with_dtype
+import ChildProject.pipelines.metricsFunctions as metfunc
+from ..utils import TimeInterval, time_intervals_intersect
+
+pipelines = {}
+
+
+class Metrics(ABC):
+    """
+    Main class for generating metrics from a project object and a list of desired metrics
+
+    :param project: ChildProject instance of the target dataset.
+    :type project: ChildProject.projects.ChildProject
+    :param metrics_list: pandas DataFrame containing the desired metrics (metrics functions are in metricsFunctions.py)
+    :type metrics_list: pd.DataFrame
+    :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename'; 'segments' is mandatory if passing the segments argument
+    :type by: str, optional
+    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
+    :type recordings: Union[str, List[str], pd.DataFrame], optional
+    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type from_time: str, optional
+    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type to_time: str, optional
+    :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and don't need to be specified. Any column that is not unique for a given unit (e.g. date_iso for a child_id being recorded on multiple days) will output a <NA> value
+    :type rec_cols: str, optional
+    :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default
+    :type child_cols: str, optional
+    :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument.
+    :type period: str, optional
+    :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the <by> option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time].
+    :type segments: Union[str, pd.DataFrame], optional
+    :param threads: number of threads to run on, defaults to 1
+    :type threads: int, optional
+    """
+
+    def __init__(
+        self,
+        project: ChildProject.projects.ChildProject,
+        metrics_list: pd.DataFrame,
+        by: str = "recording_filename",
+        recordings: Union[str, List[str], pd.DataFrame] = None,
+        from_time: str = None,
+        to_time: str = None,
+        rec_cols: str = None,  # metadata
+        child_cols: str = None,  # metadata
+        period: str = None,
+        segments: Union[str, pd.DataFrame] = None,
+        threads: int = 1,
+    ):
+
+        self.project = project
+        self.am = ChildProject.annotations.AnnotationManager(self.project)
+        self.threads = int(threads)
+
+        # check that the callable column is either a callable function or a string that can be found as being part
+        # of the list of metrics in ChildProject/pipelines/metricsFunctions.py
+        def check_callable(row):
+            if callable(row["callable"]):
+                return row["callable"]
+            if isinstance(row["callable"], str):
+                try:
+                    f = getattr(metfunc, row["callable"])
+                except Exception:
+                    raise ValueError(
+                        "{} function is not defined and was not found in ChildProject/pipelines/metricsFunctions.py".format(
+                            row["callable"]))
+                return f
+            else:
+                raise ValueError(
+                    "{} cannot be evaluated as a metric, must be a callable object or a string".format(row["callable"]))
+
+        # block checking presence of required columns and evaluating the callable functions
+        if isinstance(metrics_list, pd.DataFrame):
+            if ({'callable', 'set'}).issubset(metrics_list.columns):
+                metrics_list["callable"] = metrics_list.apply(check_callable, axis=1)
+            else:
+                raise ValueError("metrics_list parameter must contain at least the columns [callable,set]")
+        else:
+            raise ValueError("metrics_list parameter must be a pandas DataFrame")
+        metrics_list.sort_values(by="set", inplace=True)
+
+        for setname in np.unique(metrics_list['set'].values):
+            if setname not in self.am.annotations["set"].values:
+                raise ValueError(
+                    f"annotation set '{setname}' was not found in the index; "
+                    "check spelling and make sure the set was properly imported."
+                )
+        self.metrics_list = metrics_list
+
+        # necessary columns to construct the metrics
+        join_columns = {
+            "recording_filename",
+            "child_id",
+            "duration",
+            "session_id",
+            "session_offset",
+        }
+        # get existing columns of the dataset for recordings
+        correct_cols = set(self.project.recordings.columns)
+        if by != 'segments' and by not in correct_cols:
+            raise ValueError(
+                "<{}> is not specified in this dataset, cannot extract by it, change your --by option".format(by))
+        if rec_cols:
+            # when user requests recording columns, build the list and verify they exist (warn otherwise)
+            rec_cols = set(rec_cols.split(","))
+            for i in rec_cols:
+                if i not in correct_cols:
+                    print(
+                        "Warning, requested column <{}> does not exist in recordings.csv, ignoring this column. existing columns are : {}".format(
+                            i, correct_cols))
+            rec_cols &= correct_cols
+            # add wanted columns to the ones we already get
+            join_columns.update(rec_cols)
+        self.rec_cols = rec_cols
+
+        join_columns &= correct_cols
+
+        # join dataset annotations with their info in recordings.csv
+        self.am.annotations = self.am.annotations.merge(
+            self.project.recordings[list(join_columns)],
+            left_on="recording_filename",
+            right_on="recording_filename",
+        )
+
+        # get existing columns of the dataset for children
+        correct_cols = set(self.project.children.columns)
+        if child_cols:
+            # when user requests children columns, build the list and verify they exist (warn otherwise)
+            child_cols = set(child_cols.split(","))
+            child_cols.add("child_id")
+            for i in child_cols:
+                if i not in correct_cols:
+                    print(
+                        "Warning, requested column <{}> does not exist in children.csv, ignoring this column. existing columns are : {}".format(
+                            i, correct_cols))
+            child_cols &= correct_cols
+            self.child_cols = child_cols
+
+            # join dataset annotations with their info in children.csv
+            self.am.annotations = self.am.annotations.merge(
+                self.project.children[list(child_cols)],
+                left_on="child_id",
+                right_on="child_id",
+            )
+        else:
+            self.child_cols = None
+
+        self.by = by
+        self.period = period
+        self.segments = segments
+        self.recordings = Pipeline.recordings_from_list(recordings)
+
+        # If the extraction is done on segments
+        if segments is not None:
+            # we enforce that incompatible arguments are not set
+            assert by == 'segments' and period is None and recordings is None and from_time is None and to_time is None, \
+                "the <segments> option cannot be combined with options [period,recordings,from_time,to_time], and <by> should be set to 'segments'"
+
+            dtypes = {'recording_filename': 'string', 'segment_onset': 'Int64', 'segment_offset': 'Int64'}
+            # use the DataFrame provided or import it from a csv file
+            if isinstance(segments, pd.DataFrame):
+                self.segments = segments.astype(dtypes)
+            else:
+                self.segments = read_csv_with_dtype(segments, dtypes)
+
+            # check that required columns are present and dataframe not empty
+            assert_dataframe("segments", self.segments, not_empty=True)
+            assert_columns_presence(
+                "segments",
+                self.segments,
+                {"recording_filename", "segment_onset", "segment_offset"},
+            )
+        # not on segments
+        else:
+
+            # build a dataframe with all the periods we will want for each unit
+            if self.period:
+                self.periods = pd.interval_range(
+                    start=datetime.datetime(1900, 1, 1, 0, 0, 0, 0),
+                    end=datetime.datetime(1900, 1, 2, 0, 0, 0, 0),
+                    freq=self.period,
+                    closed="both",
+                )
+                self.periods = pd.DataFrame(self.periods.to_tuples().to_list(), columns=['period_start', 'period_end'])
+
+            # turn from_time and to_time into datetime objects
+            if from_time:
+                try:
+                    self.from_time = datetime.datetime.strptime(from_time, "%H:%M:%S")
+                except:
+                    raise ValueError(
+                        f"invalid value for from_time ('{from_time}'); should have HH:MM:SS format instead")
+            else:
+                self.from_time = None
+
+            if to_time:
+                try:
+                    self.to_time = datetime.datetime.strptime(to_time, "%H:%M:%S")
+                except:
+                    raise ValueError(f"invalid value for to_time ('{to_time}'); should have HH:MM:SS format instead")
+            else:
+                self.to_time = None
+
+        self._initiate_metrics_df()
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        pipelines[cls.SUBCOMMAND] = cls
+
+    def _process_unit(self, row):  # process recording line
+        # keep lines for which conv_count is not NA and group by conversation
+        """for one unit (i.e. 1 {recording|session|child} [period]) compute the list of required metrics and store the results in the current row of self.metrics
+
+        :param row: index and Series of the unit to process, to be modified with the results
+        :type row: (int , pandas.Series)
+        :return: Series containing all the computed metrics results for that unit
+        :rtype: pandas.Series
+        """
+        # row[0] is the index of the row we are processing
+        # row[1] is the actual Series containing all the metrics for the currently processed line
+        prev_set = ""
+        duration_set = 0
+        for i, line in self.metrics_list.iterrows():
+            curr_set = line["set"]
+            if prev_set != curr_set:
+                index, annotations = self.retrieve_segments([curr_set], row[1])
+                if index.shape[0]:
+                    duration_set = (
+                        index["range_offset"] - index["range_onset"]
+                    ).sum()
+                else:
+                    duration_set = 0
+                row[1]["duration_{}".format(line["set"])] = duration_set
+                prev_set = curr_set
+
+            name, value = line["callable"](annotations, duration_set,
+                                           **line.drop(['callable', 'set']).dropna().to_dict())
+            row[1][name] = value
+
+        return row[1]
+
+    def extract(self):
+        """from the initiated self.metrics, compute each row's metrics (handles threading)
+        Once the Metrics class is initialized, call this function to extract the metrics and populate self.metrics
+
+        :return: DataFrame of computed metrics
+        :rtype: pandas.DataFrame
+        """
+        if self.threads == 1:
+            self.metrics = pd.DataFrame(
+                [self._process_unit(row) for row in self.metrics.iterrows()]
+            )
+        else:
+            with mp.Pool(
+                processes=self.threads if self.threads >= 1 else mp.cpu_count()
+            ) as pool:
+                self.metrics = pd.DataFrame(
+                    pool.map(self._process_unit, self.metrics.iterrows())
+                )
+        if self.period:
+            self.metrics['period_start'] = self.metrics['period_start'].dt.strftime('%H:%M:%S')
+            self.metrics['period_end'] = self.metrics['period_end'].dt.strftime('%H:%M:%S')
+        return self.metrics
+
+    def retrieve_segments(self, sets: List[str], row: pd.Series):
+        """from a list of sets and a row identifying the unit computed, return the relevant annotation segments
+
+        :param sets: List of annotation sets to keep
+        :type sets: List[str]
+        :param row: Series storing the unit to compute information
+        :type row: pandas.Series
+        :return: relevant annotation DataFrame and index DataFrame
+        :rtype: (pandas.DataFrame , pandas.DataFrame)
+        """
+        # if extraction from segments, annotations are retrieved from get_within_ranges
+        if self.segments is not None:
+            matches = self.am.get_within_ranges(ranges=pd.DataFrame(
+                [[row['recording_filename'], row['segment_onset'], row['segment_offset']]],
+                columns=['recording_filename', 'range_onset', 'range_offset']),
+                sets=sets,
+                missing_data='warn')
+        # else prepare and use get_within_time_range
+        else:
+            annotations = self.am.annotations[self.am.annotations[self.by] == row[self.by]]
annotations = annotations[annotations["set"].isin(sets)] + # restrict to time ranges + if self.from_time and self.to_time: + # add the periods columns + if self.period: + st_hour = row["period_start"] + end_hour = row["period_end"] + intervals = time_intervals_intersect(TimeInterval(self.from_time, self.to_time), + TimeInterval(st_hour, end_hour)) + matches = pd.concat([self.am.get_within_time_range(annotations, i) for i in intervals], + ignore_index=True) if intervals else pd.DataFrame() + else: + matches = self.am.get_within_time_range( + annotations, TimeInterval(self.from_time, self.to_time)) + elif self.period: + # add the periods columns + st_hour = row["period_start"] + end_hour = row["period_end"] + matches = self.am.get_within_time_range( + annotations, TimeInterval(st_hour, end_hour)) + else: + matches = annotations + + if matches.shape[0]: + segments = self.am.get_segments(matches) + else: + # no annotations for that unit + return pd.DataFrame(), pd.DataFrame() + + # prevent overflows + segments["duration"] = ( + (segments["segment_offset"] - segments["segment_onset"]) + .astype(float) + .fillna(0) + ) + + return matches, segments + + def _initiate_metrics_df(self): + """builds a dataframe with all the rows necessary and their labels + eg : - one row per child if --by child_id and no --period + - 48 rows if 2 recordings in the corpus --period 1h --by recording_filename + Then the extract() method should populate the dataframe with actual metrics + """ + # build the metrics dataframe from the segments argument + if self.segments is not None: + recordings = self.project.get_recordings_from_list(self.segments['recording_filename'].unique()) + self.by = 'recording_filename' + self.metrics = self.segments.copy() + # else use the list of recordings of the dataset and the by option + else: + recordings = self.project.get_recordings_from_list(self.recordings) + self.metrics = pd.DataFrame(recordings[self.by].unique(), columns=[self.by]) + if self.period: + # if period, use the self.periods dataframe to build all the list of segments per unit + self.metrics[ + "key"] = 0 # with old versions of pandas, we are forced to have a common column to do a cross join, we drop the column after + self.periods["key"] = 0 + self.metrics = pd.merge(self.metrics, self.periods, on='key', how='outer').drop('key', axis=1) + + # add info for child_id + self.metrics["child_id"] = self.metrics.apply( + lambda row: self.project.recordings[self.project.recordings[self.by] == row[self.by] + ]["child_id"].iloc[0], + axis=1) + + # get and add to dataframe children.csv columns asked + if self.child_cols: + for label in self.child_cols: + self.metrics[label] = self.metrics.apply(lambda row: + self.project.children[ + self.project.children["child_id"] == row["child_id"] + ][label].iloc[0], axis=1) + + # this loop is for the purpose of checking for name duplicates in the metrics + # we do a dry run on the first line with no annotations bc impractical to check in multiprocessing + df = pd.DataFrame() + duration_set = 0 + names = set() + for i, line in self.metrics_list.iterrows(): + name, value = line["callable"](df, duration_set, **line.drop(['callable', 'set'], + errors='ignore').dropna().to_dict()) + + if name in names: + raise ValueError('the metric name <{}> is used multiple times, make sure it is unique'.format(name)) + else: + names.add(name) + + # checking that columns added by the user are unique (e.g. 
+
+
+class CustomMetrics(Metrics):
+    """metrics extraction from a csv file.
+    Extracts a number of metrics listed in a csv file as a dataframe.
+    The csv file must contain the columns:
+        - 'callable' which is the name of the wanted metric from the list of available metrics
+        - 'set' which is the set of annotations to use for that specific metric (make sure this set has the required columns for that metric)
+        - 'name' is optional, this is the name to give to that metric (if not given, a default name will be attributed)
+        - any other necessary argument for the given metrics (e.g. the voc_speaker_ph metric requires the 'speaker' argument: add a column 'speaker' in the csv file and fill its cells for this metric with the wanted value (CHI|FEM|MAL|OCH))
+
+    :param project: ChildProject instance of the target dataset.
+    :type project: ChildProject.projects.ChildProject
+    :param metrics: name of the csv file listing the metrics to extract
+    :type metrics: str
+    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
+    :type recordings: Union[str, List[str], pd.DataFrame], optional
+    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type from_time: str, optional
+    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type to_time: str, optional
+    :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and don't need to be specified. Any column that is not unique for a given unit (e.g. date_iso for a child_id being recorded on multiple days) will output a <NA> value
+    :type rec_cols: str, optional
+    :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default
+    :type child_cols: str, optional
+    :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename'; 'segments' is mandatory if passing the segments argument
+    :type by: str, optional
+    :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument.
+    :type period: str, optional
+    :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the <by> option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time].
+
+
+class LenaMetrics(Metrics):
+    """LENA metrics extractor.
+    Extracts a number of metrics from the LENA .its annotations.
+
+    :param project: ChildProject instance of the target dataset.
+    :type project: ChildProject.projects.ChildProject
+    :param set: name of the set associated to the .its annotations
+    :type set: str
+    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
+    :type recordings: Union[str, List[str], pd.DataFrame], optional
+    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type from_time: str, optional
+    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
+    :type to_time: str, optional
+    :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and don't need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a <NA> value
+    :type rec_cols: str, optional
+    :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default
+    :type child_cols: str, optional
+    :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument
+    :type by: str, optional
+    :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument.
+    :type period: str, optional
+    :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the ``by`` option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time].
+ :type segments: Union[str, pd.DataFrame], optional + :param threads: amount of threads to run on, defaults to 1 + :type threads: int, optional + """ + + SUBCOMMAND = "lena" + + def __init__( + self, + project: ChildProject.projects.ChildProject, + set: str, + recordings: Union[str, List[str], pd.DataFrame] = None, + from_time: str = None, + to_time: str = None, + rec_cols: str = None, + child_cols: str = None, + by: str = "recording_filename", + period: str = None, + segments: Union[str, pd.DataFrame] = None, + threads: int = 1, + ): + self.set = set + + METRICS = pd.DataFrame(np.array( + [["voc_speaker_ph", self.set, 'FEM'], + ["voc_speaker_ph", self.set, 'MAL'], + ["voc_speaker_ph", self.set, 'OCH'], + ["voc_speaker_ph", self.set, 'CHI'], + ["voc_dur_speaker_ph", self.set, 'FEM'], + ["voc_dur_speaker_ph", self.set, 'MAL'], + ["voc_dur_speaker_ph", self.set, 'OCH'], + ["voc_dur_speaker_ph", self.set, 'CHI'], + ["avg_voc_dur_speaker", self.set, 'FEM'], + ["avg_voc_dur_speaker", self.set, 'MAL'], + ["avg_voc_dur_speaker", self.set, 'OCH'], + ["avg_voc_dur_speaker", self.set, 'CHI'], + ["wc_speaker_ph", self.set, 'FEM'], + ["wc_speaker_ph", self.set, 'MAL'], + ["wc_adu_ph", self.set, pd.NA], + ["lp_n", self.set, pd.NA], + ["lp_dur", self.set, pd.NA], + ["lena_CVC", self.set, pd.NA], + ["lena_CTC", self.set, pd.NA], + ]), columns=["callable", "set", "speaker"]) + + super().__init__(project, METRICS, by=by, recordings=recordings, + period=period, from_time=from_time, to_time=to_time, rec_cols=rec_cols, + child_cols=child_cols, segments=segments, threads=threads) + + if self.set not in self.am.annotations["set"].values: + raise ValueError( + f"annotation set '{self.set}' was not found in the index; " + "check spelling and make sure the set was properly imported." + ) + + @staticmethod + def add_parser(subparsers, subcommand): + parser = subparsers.add_parser(subcommand, help="LENA metrics") + parser.add_argument("set", help="name of the LENA its annotations set") + + +class AclewMetrics(Metrics): + """ACLEW metrics extractor. + Extracts a number of metrics from the ACLEW pipeline annotations, which includes: + + - The Voice Type Classifier by Lavechin et al. (arXiv:2005.12656) + - The Automatic LInguistic Unit Count Estimator (ALICE) by Räsänen et al. (doi:10.3758/s13428-020-01460-x) + - The VoCalisation Maturity model (VCMNet) by Al Futaisi et al. (doi:10.1145/3340555.3353751) + + :param project: ChildProject instance of the target dataset. + :type project: ChildProject.projects.ChildProject + :param vtc: name of the set associated to the VTC annotations + :type vtc: str + :param alice: name of the set associated to the ALICE annotations + :type alice: str + :param vcm: name of the set associated to the VCM annotations + :type vcm: str + :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None + :type recordings: Union[str, List[str], pd.DataFrame], optional + :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None + :type from_time: str, optional + :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None + :type to_time: str, optional + :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. 
Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value + :type rec_cols: str, optional + :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default + :type child_cols: str, optional + :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument + :type by: str, optional + :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument. + :type period: str, optional + :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time]. + :type segments: Union[str, pd.DataFrame], optional + :param threads: amount of threads to run on, defaults to 1 + :type threads: int, optional + """ + + SUBCOMMAND = "aclew" + + def __init__( + self, + project: ChildProject.projects.ChildProject, + vtc: str = "vtc", + alice: str = "alice", + vcm: str = "vcm", + recordings: Union[str, List[str], pd.DataFrame] = None, + from_time: str = None, + to_time: str = None, + rec_cols: str = None, + child_cols: str = None, + period: str = None, + segments: Union[str, pd.DataFrame] = None, + by: str = "recording_filename", + threads: int = 1, + ): + + self.vtc = vtc + self.alice = alice + self.vcm = vcm + + am = ChildProject.annotations.AnnotationManager( + project) # temporary instance to check for existing sets. This is suboptimal because an annotation manager will be created by Metrics. 
However, the metrics class raises a ValueError for every set passed that does not exist, here we want to check in advance which of the alice and vcm sets exist without raising an error + + METRICS = np.array( + [["voc_speaker_ph", self.vtc, 'FEM'], + ["voc_speaker_ph", self.vtc, 'MAL'], + ["voc_speaker_ph", self.vtc, 'OCH'], + ["voc_speaker_ph", self.vtc, 'CHI'], + ["voc_dur_speaker_ph", self.vtc, 'FEM'], + ["voc_dur_speaker_ph", self.vtc, 'MAL'], + ["voc_dur_speaker_ph", self.vtc, 'OCH'], + ["voc_dur_speaker_ph", self.vtc, 'CHI'], + ["avg_voc_dur_speaker", self.vtc, 'FEM'], + ["avg_voc_dur_speaker", self.vtc, 'MAL'], + ["avg_voc_dur_speaker", self.vtc, 'OCH'], + ["avg_voc_dur_speaker", self.vtc, 'CHI'], + ["simple_CTC_ph", self.vtc, pd.NA], + ]) + + if self.alice not in am.annotations["set"].values: + print(f"The ALICE set ('{self.alice}') was not found in the index.") + else: + METRICS = np.concatenate((METRICS, np.array( + [["wc_speaker_ph", self.alice, 'FEM'], + ["wc_speaker_ph", self.alice, 'MAL'], + ["sc_speaker_ph", self.alice, 'FEM'], + ["sc_speaker_ph", self.alice, 'MAL'], + ["pc_speaker_ph", self.alice, 'FEM'], + ["pc_speaker_ph", self.alice, 'MAL'], + ["wc_adu_ph", self.alice, pd.NA], + ["sc_adu_ph", self.alice, pd.NA], + ["pc_adu_ph", self.alice, pd.NA], + ]))) + + if self.vcm not in am.annotations["set"].values: + print(f"The vcm set ('{self.vcm}') was not found in the index.") + else: + METRICS = np.concatenate((METRICS, np.array( + [["cry_voc_speaker_ph", self.vcm, 'CHI'], + ["cry_voc_dur_speaker_ph", self.vcm, 'CHI'], + ["avg_cry_voc_dur_speaker", self.vcm, 'CHI'], + ["can_voc_speaker_ph", self.vcm, 'CHI'], + ["can_voc_dur_speaker_ph", self.vcm, 'CHI'], + ["avg_can_voc_dur_speaker", self.vcm, 'CHI'], + ["non_can_voc_speaker_ph", self.vcm, 'CHI'], + ["non_can_voc_dur_speaker_ph", self.vcm, 'CHI'], + ["avg_non_can_voc_dur_speaker", self.vcm, 'CHI'], + ["lp_n", self.vcm, pd.NA], + ["lp_dur", self.vcm, pd.NA], + ["cp_n", self.vcm, pd.NA], + ["cp_dur", self.vcm, pd.NA], + ]))) + + METRICS = pd.DataFrame(METRICS, columns=["callable", "set", "speaker"]) + + super().__init__(project, METRICS, by=by, recordings=recordings, + period=period, from_time=from_time, to_time=to_time, + rec_cols=rec_cols, child_cols=child_cols, segments=segments, + threads=threads) + + @staticmethod + def add_parser(subparsers, subcommand): + parser = subparsers.add_parser(subcommand, help="LENA metrics") + parser.add_argument("--vtc", help="vtc set", default="vtc") + parser.add_argument("--alice", help="alice set", default="alice") + parser.add_argument("--vcm", help="vcm set", default="vcm") + + +class MetricsPipeline(Pipeline): + def __init__(self): + self.metrics = [] + + def run(self, path, destination, pipeline, func=None, **kwargs): + self.destination = destination + # build a dictionary with all parameters used + parameters = locals() + parameters = { + key: parameters[key] + for key in parameters + if key not in ["self", "kwargs", "func"] # not sure what func parameter is for, seems unecessary to keep + } + for key in kwargs: # add all kwargs to dictionary + parameters[key] = kwargs[key] + + self.project = ChildProject.projects.ChildProject(path) + self.project.read() + + try: + datarepo = Repo(path) + parameters['dataset_hash'] = datarepo.head.object.hexsha + except InvalidGitRepositoryError: + print("Your dataset is not currently a git repository") + + if pipeline not in pipelines: + raise NotImplementedError(f"invalid pipeline '{pipeline}'") + + metrics = pipelines[pipeline](self.project, 
**kwargs)
+        metrics.extract()
+
+        self.metrics = metrics.metrics
+        self.metrics.to_csv(self.destination, index=False)
+
+        # get the df of metrics used from the Metrics class
+        metrics_df = metrics.metrics_list
+        metrics_df['callable'] = metrics_df.apply(lambda row: row['callable'].__name__,
+                                                  axis=1)  # from the callables used, find their name back
+        parameters['metrics_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in
+                                      metrics_df.to_dict(orient='records')]
+        date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        # create a yaml file with all the parameters used
+        self.parameters_path = os.path.splitext(self.destination)[0] + "_parameters_{}.yml".format(date)
+        print("exported metrics to {}".format(self.destination))
+        yaml.dump(
+            {
+                "package_version": ChildProject.__version__,
+                "date": date,
+                "parameters": parameters,
+            },
+            open(self.parameters_path, "w+"), sort_keys=False,
+        )
+        print("exported metrics parameters to {}".format(self.parameters_path))
+
+        return self.metrics
+
+    @staticmethod
+    def setup_parser(parser):
+        parser.add_argument("path", help="path to the dataset")
+        parser.add_argument("destination", help="destination of the metrics file")
+
+        subparsers = parser.add_subparsers(help="pipeline", dest="pipeline")
+        for pipeline in pipelines:
+            pipelines[pipeline].add_parser(subparsers, pipeline)
+
+        parser.add_argument(
+            "--recordings",
+            help="path to a CSV dataframe containing the list of recordings to sample from (by default, all recordings will be sampled). The CSV should have one column named recording_filename.",
+            default=None,
+        )
+
+        parser.add_argument(
+            "--by",
+            help="units to sample from (default behavior is to sample by recording)",
+            choices=["recording_filename", "session_id", "child_id", "experiment", "segments"],
+            default="recording_filename",
+        )
+
+        parser.add_argument(
+            "--segments",
+            help="path to a CSV dataframe containing the list of segments to sample from. The CSV should have 3 columns named recording_filename, segment_onset, segment_offset. --by must be set to 'segments'. Cannot be used along with options [--period,--recordings,--from-time,--to-time]",
+            default=None,
+        )
+
+        parser.add_argument(
+            "--period",
+            help="time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument. The resulting metrics will be split for each unit across all the resulting periods.",
+            default=None,
+        )
+
+        parser.add_argument(
+            "-f",
+            "--from-time",
+            help="time range start in HH:MM:SS format (optional)",
+            default=None,
+        )
+
+        parser.add_argument(
+            "-t",
+            "--to-time",
+            help="time range end in HH:MM:SS format (optional)",
+            default=None,
+        )
+
+        parser.add_argument(
+            "--rec-cols",
+            help="comma separated columns from recordings.csv to include in the outputted metrics (optional), NA if ambiguous",
+            default=None,
+        )
+
+        parser.add_argument(
+            "--child-cols",
+            help="comma separated columns from children.csv to include in the outputted metrics (optional), NA if ambiguous",
+            default=None,
+        )
+
+        parser.add_argument(
+            "--threads", help="amount of threads to run on", default=1, type=int
+        )
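
The specification pipeline below reads back exactly such a yml parameters file; a minimal hand-written one could be produced like this (a sketch: all paths and set names are hypothetical):

    import yaml

    # 'path', 'destination' and 'metrics_list' are the keys required by
    # MetricsSpecificationPipeline.run(); everything else is optional
    parameters = {
        "path": "/path/to/dataset",
        "destination": "output/metrics.csv",
        "metrics_list": [
            {"callable": "voc_speaker_ph", "set": "vtc", "speaker": "CHI"},
            {"callable": "wc_adu_ph", "set": "alice"},
        ],
    }
    with open("parameters.yml", "w") as f:
        yaml.dump({"parameters": parameters}, f, sort_keys=False)
    # then: child-project metrics-specification parameters.yml
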
+class MetricsSpecificationPipeline(Pipeline):
+    def __init__(self):
+        self.metrics = []
+
+    def run(self, parameters_input, func=None):
+        # build a dictionary with all parameters used
+        parameters = None
+        with open(parameters_input, "r") as stream:
+            try:
+                parameters = yaml.safe_load(stream)
+                if 'parameters' in parameters: parameters = parameters['parameters']
+            except yaml.YAMLError as exc:
+                raise yaml.YAMLError(
+                    "parsing of the parameters file {} failed. See above exception for more details".format(
+                        parameters_input)) from exc
+
+        if parameters:
+            if "path" not in parameters:
+                raise ValueError(
+                    "the parameter file {} must contain at least the 'path' key specifying the path to the dataset".format(
+                        parameters_input))
+            if "destination" not in parameters:
+                raise ValueError(
+                    "the parameter file {} must contain the 'destination' key specifying the file to output the metrics to".format(
+                        parameters_input))
+            if "metrics_list" not in parameters:
+                raise ValueError(
+                    "the parameter file {} must contain the 'metrics_list' key containing the list of the desired metrics".format(
+                        parameters_input))
+            try:
+                metrics_df = pd.DataFrame(parameters["metrics_list"])
+            except Exception as e:
+                raise ValueError(
+                    "The 'metrics_list' key in {} must be a list of elements".format(parameters_input)) from e
+        else:
+            raise ValueError("could not find any parameters in {}".format(parameters_input))
+
+        try:
+            datarepo = Repo(parameters["path"])
+            parameters['dataset_hash'] = datarepo.head.object.hexsha
+        except InvalidGitRepositoryError:
+            print("Your dataset is not currently a git repository")
+
+        self.project = ChildProject.projects.ChildProject(parameters["path"])
+        self.project.read()
+
+        self.destination = parameters['destination']
+
+        unwanted_keys = {'metrics', 'pipeline'}
+        for i in unwanted_keys:
+            if i in parameters: del parameters[i]
+
+        arguments = {
+            key: parameters[key]
+            for key in parameters
+            if key not in {"metrics_list", "path", "destination", "dataset_hash"}
+        }
+        try:
+            metrics = Metrics(self.project, metrics_df, **arguments)
+        except TypeError as e:
+            raise ValueError('Unrecognized parameter found {}'.format(e.args[0][46:])) from e
+        metrics.extract()
+
+        self.metrics = metrics.metrics
+        self.metrics.to_csv(self.destination, index=False)
+
+        # get the df of metrics used from the Metrics class
+        metrics_df = metrics.metrics_list
+        metrics_df['callable'] = metrics_df.apply(lambda row: row['callable'].__name__,
+                                                  axis=1)  # from the callables used, find their name back
+        parameters['metrics_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in
+                                      metrics_df.to_dict(orient='records')]
+        date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        # create a yaml
file with all the parameters used + self.parameters_path = os.path.splitext(self.destination)[0] + "_parameters_{}.yml".format(date) + print("exported metrics to {}".format(self.destination)) + yaml.dump( + { + "package_version": ChildProject.__version__, + "date": date, + "parameters": parameters, + }, + open(self.parameters_path, "w+"), sort_keys=False, + ) + print("exported metrics parameters to {}".format(self.parameters_path)) + + return self.metrics + + @staticmethod + def setup_parser(parser): + parser.add_argument("parameters_input", help="path to the yml file with all parameters") \ No newline at end of file From ee5b3c8a8e464d09cccfdd03acd3fda175580ff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Wed, 28 Feb 2024 11:36:20 +0100 Subject: [PATCH 05/44] retrieve unwanted changes --- ChildProject/annotations.py | 202 ------------------------------------ ChildProject/cmdline.py | 36 ------- 2 files changed, 238 deletions(-) diff --git a/ChildProject/annotations.py b/ChildProject/annotations.py index 0118f68a..33e61414 100644 --- a/ChildProject/annotations.py +++ b/ChildProject/annotations.py @@ -923,208 +923,6 @@ def derive_annotations(self, return imported, errors - def _summarise_conversations( - self, - annotation: dict, - overwrite_existing: bool = False, - ): - """import and convert ``annotation``. This function should not be called outside of this class. - - :param import_function: If callable, ``import_function`` will be called to convert the input annotation into a dataframe. Otherwise, the conversion will be performed by a built-in function. - :type import_function: Callable[[str], pd.DataFrame] - :param output_set: name of the new set of derived annotations - :type output_set: str - :param params: Optional parameters. With ```new_tiers```, the corresponding EAF tiers will be imported - :type params: dict - :param annotation: input annotation dictionary (attributes defined according to :ref:`ChildProject.annotations.AnnotationManager.SEGMENTS_COLUMNS`) - :type annotation: dict - :param overwrite_existing: choose if lines with the same set and annotation_filename should be overwritten - :type overwrite_existing: bool - :return: output annotation dictionary (attributes defined according to :ref:`ChildProject.annotations.AnnotationManager.SEGMENTS_COLUMNS`) - :rtype: dict - """ - - source_recording = os.path.splitext(annotation["recording_filename"])[0] - annotation_filename = "{}_{}_{}.csv".format( - source_recording, annotation["range_onset"], annotation["range_offset"] - ) - output_filename = os.path.join( - "extra", annotation_filename - ) - - # # check if the annotation file already exists in dataset (same filename and same set) - # if self.annotations[(self.annotations['set'] == output_set) & - # (self.annotations['annotation_filename'] == annotation_filename)].shape[0] > 0: - # if overwrite_existing: - # logger_annotations.warning("Derived file %s will be overwritten", output_filename) - # - # else: - # logger_annotations.warning("File %s already exists. 
To overwrite, specify parameter ''overwrite_existing''", output_filename) - # return annotation_result - - # find if there are annotation indexes in the same set that overlap the new annotation - # as it is not possible to annotate multiple times the same audio stretch in the same set - # ovl_annots = self.annotations[(self.annotations['set'] == output_set) & - # (self.annotations[ - # 'annotation_filename'] != annotation_filename) & # this condition avoid matching a line that should be overwritten (so has the same annotation_filename), it is dependent on the previous block!!! - # (self.annotations['recording_filename'] == annotation['recording_filename']) & - # (self.annotations['range_onset'] < annotation['range_offset']) & - # (self.annotations['range_offset'] > annotation['range_onset']) - # ] - # if ovl_annots.shape[0] > 0: - # array_tup = list( - # ovl_annots[['set', 'recording_filename', 'range_onset', 'range_offset']].itertuples(index=False, - # name=None)) - # annotation_result[ - # "error"] = f"derivation for set <{output_set}> recording <{annotation['recording_filename']}> from {annotation['range_onset']} to {annotation['range_offset']} cannot continue because it overlaps with these existing annotation lines: {array_tup}" - # logger_annotations.error("Error: %s", annotation['error']) - # # (f"Error: {annotation['error']}") - # annotation_result["imported_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - # return annotation_result - - path = os.path.join( - self.project.path, - "annotations", - annotation["set"], - "converted", #EXPAND - annotation["annotation_filename"], - ) - - #TODO CHECK FOR DTYPES - df_input = pd.read_csv(path) - df = None - - try: - df = conversations(df_input) - # if callable(import_function): - # df = import_function(df_input) - # elif import_function in DERIVATIONS.keys(): - # df = DERIVATIONS[import_function](df_input) - # else: - # raise ValueError( - # "derivation value '{}' unknown, use one of {}".format(import_function, DERIVATIONS.keys()) - # ) - except: - annotation["error"] = traceback.format_exc() - logger_annotations.error("An error occurred while processing '%s'", path, exc_info=True) - - if df is None or not isinstance(df, pd.DataFrame): - annotation_result["imported_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - return annotation_result - - if not df.shape[1]: - df = pd.DataFrame(columns=[c.name for c in self.SEGMENTS_COLUMNS]) - - df["raw_filename"] = annotation["raw_filename"] - - df["segment_onset"] += np.int64(annotation["time_seek"]) - df["segment_offset"] += np.int64(annotation["time_seek"]) - df["segment_onset"] = df["segment_onset"].astype(np.int64) - df["segment_offset"] = df["segment_offset"].astype(np.int64) - - annotation_result["time_seek"] = np.int64(annotation["time_seek"]) - annotation_result["range_onset"] = np.int64(annotation["range_onset"]) - annotation_result["range_offset"] = np.int64(annotation["range_offset"]) - - df = AnnotationManager.clip_segments( - df, annotation_result["range_onset"], annotation_result["range_offset"] - ) - - sort_columns = ["segment_onset", "segment_offset"] - if "speaker_type" in df.columns: - sort_columns.append("speaker_type") - - df.sort_values(sort_columns, inplace=True) - - os.makedirs( - os.path.dirname(os.path.join(self.project.path, output_filename)), - exist_ok=True, - ) - df.to_csv(os.path.join(self.project.path, output_filename), index=False) - - annotation_result["annotation_filename"] = annotation_filename - annotation_result["imported_at"] = 
datetime.datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ) - annotation_result["package_version"] = __version__ - - return annotation_result - - def summarise_conversations(self, - input_set: str, - output_set: str, - derivation_function: Union[str, Callable], - threads: int = -1, - overwrite_existing: bool = False, - ) -> (pd.DataFrame, pd.DataFrame): - """Derive annotations. - - :param input_set: name of the set of annotations to be derived - :rtype: str - :param output_set: name of the new set of derived annotations - :rtype: str - :param derivation_function: name of the derivation type to be performed - :rtype: Union[str, Callable] - :param threads: If > 1, conversions will be run on ``threads`` threads, defaults to -1 - :type threads: int, optional - :param overwrite_existing: choice if lines with the same set and annotation_filename should be overwritten - :type overwrite_existing: bool, optional - :return: tuple of dataframe of derived annotations, as in :ref:`format-annotations` and dataframe of errors - :rtype: tuple (pd.DataFrame, pd.DataFrame) - """ - input_processed = self.annotations[self.annotations['set'] == input_set].copy() - assert not input_processed.empty, "Input set {0} does not exist".format(input_set) - - if threads == 1: - imported = input_processed.apply( - partial(self._summarise_conversations, - overwrite_existing=overwrite_existing - ), axis=1 - ).to_dict(orient="records") - else: - - with mp.Pool(processes=threads if threads > 0 else mp.cpu_count()) as pool: - imported = pool.map( - partial(self._summarise_conversations, - overwrite_existing=overwrite_existing - ), - input_processed.to_dict(orient="records"), - ) - - imported = pd.DataFrame(imported) - imported.drop( - list(set(imported.columns) - {c.name for c in self.INDEX_COLUMNS}), - axis=1, - inplace=True, - ) - - if 'error' in imported.columns: - errors = imported[~imported["error"].isnull()] - imported = imported[imported["error"].isnull()] - # when errors occur, separate them in a different csv in extra - if errors.shape[0] > 0: - output = os.path.join(self.project.path, "extra", - "errors_conv_summary_{}.csv".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))) - errors.to_csv(output, index=False) - logger_annotations.info("Errors summary exported to %s", output) - else: - errors = None - - self.read() - self.annotations = pd.concat([self.annotations, imported], sort=False) - # at this point, 2 lines with same set and annotation_filename can happen if specified overwrite, - # dropping duplicates remove the first importation and keeps the more recent one - self.annotations = self.annotations.sort_values('imported_at').drop_duplicates( - subset=["set", "recording_filename", "range_onset", "range_offset"], keep='last') - self.write() - - sets = set(input_processed['set'].unique()) - outdated_sets = self._check_for_outdated_merged_sets(sets=sets) - for warning in outdated_sets: - logger_annotations.warning("warning: %s", warning) - - return imported, errors - def get_subsets(self, annotation_set: str, recursive: bool = False) -> List[str]: """Retrieve the list of subsets belonging to a given set of annotations. 
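
The summarise_conversations() method removed above dispatched its per-annotation work with ChildProject's usual pool/partial idiom; a minimal self-contained sketch of that pattern (process() is a hypothetical stand-in for the real worker):

    import multiprocessing as mp
    from functools import partial

    def process(annotation: dict, overwrite_existing: bool = False) -> dict:
        # stand-in for per-annotation work such as _summarise_conversations()
        annotation["processed"] = True
        return annotation

    def run_all(records: list, threads: int = -1, overwrite_existing: bool = False) -> list:
        if threads == 1:
            return [process(r, overwrite_existing=overwrite_existing) for r in records]
        # as in the removed code, non-positive thread counts fall back to all cores
        with mp.Pool(processes=threads if threads > 0 else mp.cpu_count()) as pool:
            return pool.map(partial(process, overwrite_existing=overwrite_existing), records)

    if __name__ == "__main__":
        print(run_all([{"recording_filename": "rec1.wav"}, {"recording_filename": "rec2.wav"}], threads=2))
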
diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py index e5e4de98..86baf180 100755 --- a/ChildProject/cmdline.py +++ b/ChildProject/cmdline.py @@ -290,42 +290,6 @@ def derive_annotations(args): logger.error("\n".join(errors)) logger.error("\n".join(warnings)) -@subcommand( - [ - arg("source", help="project path"), - arg("--input-set", "-i", help="input set", required=True, type=str), - arg("--threads", help="amount of threads to run on", type=int, default=0), - arg("--overwrite-existing", "--ow", - help="overwrites existing summary file if should generate the same output file (useful when reimporting)", - action='store_true'), - ] -) -def summarise_conversations(args): - """generate summary metrics for conversations""" - - project = ChildProject(args.source) - - perform_validation(project, require_success=True, ignore_recordings=True) - - am = AnnotationManager(project) - imported, errors_der = am.summarise_conversations(args.input_set, args.threads, overwrite_existing=args.overwrite_existing) - - if errors_der is not None and errors_der.shape[0] > 0: - logger.error('Conversational summary generation failed for %d entry/ies', errors_der.shape[0]) - logger.debug(errors_der) - - if imported is not None and imported.shape[0] > 0: - errors, warnings = am.validate(imported, threads=args.threads) - - if len(am.errors) > 0: - logger.error( - "in the resulting annotations %s errors and %s warnings were found", - len(am.errors) + len(errors), - len(warnings), - ) # Is it right ? - logger.error("\n".join(am.errors)) - logger.error("\n".join(errors)) - logger.error("\n".join(warnings)) @subcommand( [ From 61c4e83e27754b1a9d562d84e5bb0ef7fe69fefe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Wed, 28 Feb 2024 11:36:34 +0100 Subject: [PATCH 06/44] Update .DS_Store --- .../annotations/alice/.DS_Store | Bin 6148 -> 6148 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/examples/valid_raw_data/annotations/alice/.DS_Store b/examples/valid_raw_data/annotations/alice/.DS_Store index 1d960c0fb771c0bb2a3f367396a3a21bfa6c7b75..79bc424dfe9688bd5dd30d2e58abed6c5c569f39 100644 GIT binary patch delta 283 zcmZoMXfc=|#>B)qu~2NHo}wr-0|Nsi1A_nqLn%XvXHI@{Qcix-W=5vvj9DNlHimqV z1VaI`G*BFZjRXC zjQsN8lEjkIVyDT0jNY-FoSg9jlGW8lW)?aMhDH{(IttZ>h6aW@3TEaewY8iaqRRT# zLGjr+xq12BU<(--fi4FFUMLNtdVmZRr*Hns_=;sSI|n}p(Ca|&eP^D`FJj37)DJdd JbAZSeW&rq$J@^0s delta 245 zcmZoMXfc=|#>B`mu~2NHo}wrd0|Nsi1A_oVaY0f}eiD#(GO1u<;qu7_A}mhK3`Goy z4CP7X$dWmk#U%y?*BP0ZSyCz62Gn7jWtm^GY;4eA+|16w&jIw*W<%!h%#-;=EIB}`fhI6*4iMSG F3;@$@Hpu`0 From 125fa013ec8c70cd118fd429ff25965cbe447546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Wed, 28 Feb 2024 12:18:12 +0100 Subject: [PATCH 07/44] Update Conversations class --- ChildProject/pipelines/conversations.py | 163 ++++++------------------ 1 file changed, 42 insertions(+), 121 deletions(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index 16ac4b50..ce0208db 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -20,9 +20,9 @@ pipelines = {} -class Metrics(ABC): +class Conversations(ABC): """ - Main class for generating metrics from a project object and a list of desired metrics + Main class for generating conversational metrics from a project object and a list of desired metrics :param project: ChildProject instance of the target dataset. 
:type project: ChildProject.projects.ChildProject @@ -40,10 +40,6 @@ class Metrics(ABC): :type rec_cols: str, optional :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default :type child_cols: str, optional - :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument. - :type period: str, optional - :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time]. - :type segments: Union[str, pd.DataFrame], optional :param threads: amount of threads to run on, defaults to 1 :type threads: int, optional """ @@ -58,8 +54,6 @@ def __init__( to_time: str = None, rec_cols: str = None, #metadata child_cols: str = None, #metadata - #period: str = None, - #segments: Union[str, pd.DataFrame] = None, threads: int = 1, ): @@ -75,7 +69,7 @@ def check_callable(row): f = getattr(metfunc, row["callable"]) except Exception: raise ValueError( - "{} function is not defined and was not found in ChildProject/pipelines/metricsFunctions.py".format( + "{} function is not defined and was not found in ChildProject/pipelines/conversationFunctions.py".format( row["callable"])) return f else: @@ -158,59 +152,25 @@ def check_callable(row): self.child_cols = None self.by = by - self.period = period - self.segments = segments self.recordings = Pipeline.recordings_from_list(recordings) - # If the extraction is done on segments - if segments is not None: - # we enforce that incompatible arguments are not set - assert by == 'segments' and period is None and recordings is None and from_time is None and to_time is None, "the option can not be combined with options [period,recordings,from_time,to_time], and should be set to 'segments'" - - dtypes = {'recording_filename': 'string', 'segment_onset': 'Int64', 'segment_offset': 'Int64'} - # use the DataFrame provided or import it from a csv file - if isinstance(segments, pd.DataFrame): - self.segments = segments.astype(dtypes) - else: - self.segments = read_csv_with_dtype(segments, dtypes) - - # check that required columns are present and dataframe not empty - assert_dataframe("segments", self.segments, not_empty=True) - assert_columns_presence( - "segments", - self.segments, - {"recording_filename", "segment_onset", "segment_offset"}, - ) - # not on segments - else: - - # build a dataframe with all the periods we will want for each unit - if self.period: - self.periods = pd.interval_range( - start=datetime.datetime(1900, 1, 1, 0, 0, 0, 0), - end=datetime.datetime(1900, 1, 2, 0, 0, 0, 0), - freq=self.period, - closed="both", - ) - self.periods = pd.DataFrame(self.periods.to_tuples().to_list(), columns=['period_start', 'period_end']) - - # turn from_time and to to_time to datetime objects - if from_time: - try: - self.from_time = datetime.datetime.strptime(from_time, "%H:%M:%S") - except: - raise ValueError( + # turn from_time and to to_time to datetime objects + if from_time: + try: + self.from_time = datetime.datetime.strptime(from_time, "%H:%M:%S") + except: + raise ValueError( f"invalid value for from_time ('{from_time}'); should have HH:MM:SS format instead") - else: - self.from_time = None + else: + self.from_time = None - if to_time: - try: - self.to_time = datetime.datetime.strptime(to_time, "%H:%M:%S") - except: - raise 
ValueError(f"invalid value for to_time ('{to_time}'); should have HH:MM:SS format instead") - else: - self.to_time = None + if to_time: + try: + self.to_time = datetime.datetime.strptime(to_time, "%H:%M:%S") + except: + raise ValueError(f"invalid value for to_time ('{to_time}'); should have HH:MM:SS format instead") + else: + self.to_time = None self._initiate_metrics_df() @@ -218,9 +178,9 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) pipelines[cls.SUBCOMMAND] = cls - def _process_unit(self, row): #process recording line + def _process_recording(self, row): #process recording line #keep lines for which conv_count is nopt Na and group by conv - """for one unit (i.e. 1 {recording|session|child} [period]) compute the list of required metrics and store the results in the current row of self.metrics + """for one unit (i.e. 1 recording) compute the list of required metrics and store the results in the current row of self.metrics :param row: index and Series of the unit to process, to be modified with the results :type row: (int , pandas.Series) @@ -259,18 +219,15 @@ def extract(self): """ if self.threads == 1: self.metrics = pd.DataFrame( - [self._process_unit(row) for row in self.metrics.iterrows()] + [self._process_recording(row) for row in self.metrics.iterrows()] ) else: with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() ) as pool: self.metrics = pd.DataFrame( - pool.map(self._process_unit, self.metrics.iterrows()) + pool.map(self._process_recording, self.metrics.iterrows()) ) - if self.period: - self.metrics['period_start'] = self.metrics['period_start'].dt.strftime('%H:%M:%S') - self.metrics['period_end'] = self.metrics['period_end'].dt.strftime('%H:%M:%S') return self.metrics def retrieve_segments(self, sets: List[str], row: str): @@ -337,21 +294,15 @@ def _initiate_metrics_df(self): - 48 rows if 2 recordings in the corpus --period 1h --by recording_filename Then the extract() method should populate the dataframe with actual metrics """ - # build the metrics dataframe from the segments argument - if self.segments is not None: - recordings = self.project.get_recordings_from_list(self.segments['recording_filename'].unique()) - self.by = 'recording_filename' - self.metrics = self.segments.copy() - # else use the list of recordings of the dataset and the by option - else: - recordings = self.project.get_recordings_from_list(self.recordings) - self.metrics = pd.DataFrame(recordings[self.by].unique(), columns=[self.by]) - if self.period: - # if period, use the self.periods dataframe to build all the list of segments per unit - self.metrics[ - "key"] = 0 # with old versions of pandas, we are forced to have a common column to do a cross join, we drop the column after - self.periods["key"] = 0 - self.metrics = pd.merge(self.metrics, self.periods, on='key', how='outer').drop('key', axis=1) + + recordings = self.project.get_recordings_from_list(self.recordings) + self.metrics = pd.DataFrame(recordings[self.by].unique(), columns=[self.by]) + if self.period: + # if period, use the self.periods dataframe to build all the list of segments per unit + self.metrics[ + "key"] = 0 # with old versions of pandas, we are forced to have a common column to do a cross join, we drop the column after + self.periods["key"] = 0 + self.metrics = pd.merge(self.metrics, self.periods, on='key', how='outer').drop('key', axis=1) # add info for child_id self.metrics["child_id"] = self.metrics.apply( @@ -399,7 +350,7 @@ def check_unicity(row, label): 
self.metrics[label] = self.metrics.apply(lambda row: check_unicity(row, label), axis=1) -class CustomMetrics(Metrics): +class CustomMetrics(Conversations): """metrics extraction from a csv file. Extracts a number of metrics listed in a csv file as a dataframe. the csv file must contain the columns : @@ -424,10 +375,6 @@ class CustomMetrics(Metrics): :type child_cols: str, optional :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument :type by: str, optional - :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument. - :type period: str, optional - :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time]. - :type segments: Union[str, pd.DataFrame], optional :param threads: amount of threads to run on, defaults to 1 :type threads: int, optional """ @@ -444,15 +391,13 @@ def __init__( rec_cols: str = None, child_cols: str = None, by: str = "recording_filename", - period: str = None, - segments: Union[str, pd.DataFrame] = None, threads: int = 1, ): metrics_df = pd.read_csv(metrics) super().__init__(project, metrics_df, by=by, recordings=recordings, from_time=from_time, to_time=to_time, rec_cols=rec_cols, - child_cols=child_cols, period=period, segments=segments, threads=threads) + child_cols=child_cols, threads=threads) @staticmethod def add_parser(subparsers, subcommand): @@ -462,7 +407,7 @@ def add_parser(subparsers, subcommand): ) -class LenaMetrics(Metrics): +class LenaMetrics(Conversations): """LENA metrics extractor. Extracts a number of metrics from the LENA .its annotations. @@ -482,10 +427,6 @@ class LenaMetrics(Metrics): :type child_cols: str, optional :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument :type by: str, optional - :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument. - :type period: str, optional - :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time]. 
- :type segments: Union[str, pd.DataFrame], optional :param threads: amount of threads to run on, defaults to 1 :type threads: int, optional """ @@ -502,8 +443,6 @@ def __init__( rec_cols: str = None, child_cols: str = None, by: str = "recording_filename", - period: str = None, - segments: Union[str, pd.DataFrame] = None, threads: int = 1, ): self.set = set @@ -531,8 +470,8 @@ def __init__( ]), columns=["callable", "set", "speaker"]) super().__init__(project, METRICS, by=by, recordings=recordings, - period=period, from_time=from_time, to_time=to_time, rec_cols=rec_cols, - child_cols=child_cols, segments=segments, threads=threads) + from_time=from_time, to_time=to_time, rec_cols=rec_cols, + child_cols=child_cols, threads=threads) if self.set not in self.am.annotations["set"].values: raise ValueError( @@ -546,7 +485,7 @@ def add_parser(subparsers, subcommand): parser.add_argument("set", help="name of the LENA its annotations set") -class AclewMetrics(Metrics): +class AclewMetrics(Conversations): """ACLEW metrics extractor. Extracts a number of metrics from the ACLEW pipeline annotations, which includes: @@ -574,10 +513,6 @@ class AclewMetrics(Metrics): :type child_cols: str, optional :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument :type by: str, optional - :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument. - :type period: str, optional - :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time]. - :type segments: Union[str, pd.DataFrame], optional :param threads: amount of threads to run on, defaults to 1 :type threads: int, optional """ @@ -595,8 +530,6 @@ def __init__( to_time: str = None, rec_cols: str = None, child_cols: str = None, - period: str = None, - segments: Union[str, pd.DataFrame] = None, by: str = "recording_filename", threads: int = 1, ): @@ -661,8 +594,8 @@ def __init__( METRICS = pd.DataFrame(METRICS, columns=["callable", "set", "speaker"]) super().__init__(project, METRICS, by=by, recordings=recordings, - period=period, from_time=from_time, to_time=to_time, - rec_cols=rec_cols, child_cols=child_cols, segments=segments, + from_time=from_time, to_time=to_time, + rec_cols=rec_cols, child_cols=child_cols, threads=threads) @staticmethod @@ -673,7 +606,7 @@ def add_parser(subparsers, subcommand): parser.add_argument("--vcm", help="vcm set", default="vcm") -class MetricsPipeline(Pipeline): +class ConversationsPipeline(Pipeline): def __init__(self): self.metrics = [] @@ -751,18 +684,6 @@ def setup_parser(parser): default="recording_filename", ) - parser.add_argument( - "--segments", - help="path to a CSV dataframe containing the list of segments to sample from. The CSV should have 3 columns named recording_filename, segment_onset, segment_offset. --by must be set to 'segments', Can not be used along with options [--period,--recordings,--from-tim,--to-time]", - default=None, - ) - - parser.add_argument( - "--period", - help="time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument. 
The resulting metrics will be split for each unit across all the resulting periods.", - default=None, - ) - parser.add_argument( "-f", "--from-time", @@ -794,7 +715,7 @@ def setup_parser(parser): ) -class MetricsSpecificationPipeline(Pipeline): +class ConversationsSpecificationPipeline(Pipeline): def __init__(self): self.metrics = [] @@ -852,7 +773,7 @@ def run(self, parameters_input, func=None): if key not in {"metrics_list", "path", "destination", "dataset_hash"} } try: - metrics = Metrics(self.project, metrics_df, **arguments) + metrics = Conversations(self.project, metrics_df, **arguments) except TypeError as e: raise ValueError('Unrecognized parameter found {}'.format(e.args[0][46:])) from e metrics.extract() From 55f39d0a9db695ec2278bc7e98ae820da1d09e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Wed, 28 Feb 2024 12:18:22 +0100 Subject: [PATCH 08/44] register conversations pipelines --- ChildProject/cmdline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py index 86baf180..3efe5822 100755 --- a/ChildProject/cmdline.py +++ b/ChildProject/cmdline.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 from ChildProject.projects import ChildProject from ChildProject.annotations import AnnotationManager +from .pipelines.conversations import ConversationsPipeline +from .pipelines.conversations import ConversationsSpecificationPipeline from .pipelines.samplers import SamplerPipeline from .pipelines.eafbuilder import EafBuilderPipeline from .pipelines.zooniverse import ZooniversePipeline @@ -641,7 +643,8 @@ def main(): register_pipeline("anonymize", AnonymizationPipeline) register_pipeline("metrics", MetricsPipeline) register_pipeline("metrics-specification", MetricsSpecificationPipeline) - #register_pipeline("conversations-summary", ConversationsPipeline) + register_pipeline("conversations-summary", ConversationsPipeline) + register_pipeline("conversations-specification", ConversationsSpecificationPipeline) args = parser.parse_args() args.func(args) From b274aefbad3b599d068be11abf6db83a9eefe72a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Wed, 28 Feb 2024 13:28:38 +0100 Subject: [PATCH 09/44] Define metrics --- .../pipelines/conversationFunctions.py | 38 +++++++++++++++++-- ChildProject/pipelines/conversations.py | 6 +-- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index 35d9dec0..3eb0f97e 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -8,7 +8,7 @@ """ This file lists all the metrics functions commonly used. -New metrics can be added by defining new functions for the Metrics class to use : +New metrics can be added by defining new functions for the Conversations class to use : - Create a new function using the same arguments (i.e. annotations, duration, **kwargs) - Define calculation of the metric with: - annotations, which is a dataframe containing all the relevant annotated segments to use. It contains the @@ -19,7 +19,7 @@ - duration which is the duration of audio annotated in milliseconds - kwargs, whatever keyword parameter you chose to pass to the function (except 'name', 'callable', 'set' which can not be used). 
This will need to be given with the list of metrics when called - - Wrap you function with the 'metricFunction' decorator to make it callable by the pipeline, read metricFunction help + - Wrap you function with the 'conversationFunction' decorator to make it callable by the pipeline, read conversationFunction help for more info !! Metrics functions should still behave and return the correct result when receiving an empty dataframe @@ -31,7 +31,7 @@ RESERVED = {'set', 'name', 'callable'} # arguments reserved usage. use other keyword labels. -def metricFunction(args: set, columns: Union[Set[str], Tuple[Set[str], ...]], empty_value=0, default_name: str = None): +def conversationFunction(args: set, columns: Union[Set[str], Tuple[Set[str], ...]], empty_value=0, default_name: str = None): """Decorator for all metrics functions to make them ready to be called by the pipeline. :param args: set of required keyword arguments for that function, raise ValueError if were not given \ @@ -105,6 +105,38 @@ def new_func(annotations: pd.DataFrame, duration: int, **kwargs): return decorator +@conversationFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) +def is_speaker(annotations: pd.DataFrame, **kwargs): + return kwargs["speaker"] in annotations['speaker_type'].tolist() + +@conversationFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) +def voc_counter(annotations: pd.DataFrame, **kwargs): + return annotations[annotations['speaker_type'] == kwargs["speaker"]]['speaker_type'].count() + +@conversationFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) +def voc_total(annotations: pd.DataFrame, **kwargs): + return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) + +@conversationFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) +def voc_average(annotations: pd.DataFrame, **kwargs): + return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].mean() + + +def cp_dur(annotations: pd.DataFrame, duration: int, **kwargs): + """canonical proportion on the number of vocalizations for CHI (based on vcm_type) + + Required keyword arguments: + """ + speech_dur = annotations.loc[(annotations["speaker_type"] == "CHI") & + (annotations["vcm_type"].isin(["N", "C"]))]["duration"].sum() + can_dur = annotations.loc[(annotations["speaker_type"] == "CHI") & + (annotations["vcm_type"] == "C")]["duration"].sum() + if speech_dur: + value = can_dur / speech_dur + else: + value = np.nan + return value + def peak_hour_metric(empty_value=0): """ diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index ce0208db..de81c241 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -14,7 +14,7 @@ from ChildProject.pipelines.pipeline import Pipeline from ChildProject.tables import assert_dataframe, assert_columns_presence, read_csv_with_dtype -import ChildProject.pipelines.metricsFunctions as metfunc +import ChildProject.pipelines.conversationFunctions as convfunc from ..utils import TimeInterval, time_intervals_intersect pipelines = {} @@ -61,12 +61,12 @@ def __init__( self.am = ChildProject.annotations.AnnotationManager(self.project) self.threads = int(threads) - # check that the callable column is either a callable function or a string that can be found as being part of the list of metrics in ChildProject/pipelines/metricsFunctions.py + # check that the callable column is either a callable function or a string that can be 
found as being part of the list of metrics in ChildProject/pipelines/conversationFunctions.py def check_callable(row): if callable(row["callable"]): return row["callable"] if isinstance(row["callable"], str): try: - f = getattr(metfunc, row["callable"]) + f = getattr(convfunc, row["callable"]) except Exception: raise ValueError( "{} function is not defined and was not found in ChildProject/pipelines/conversationFunctions.py".format( From b881013b9b1aedd5e8888ff2c430d9b9ec85e5b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Wed, 28 Feb 2024 16:40:46 +0100 Subject: [PATCH 10/44] Updated metric parameters --- .../pipelines/conversationFunctions.py | 28 ++++--------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index 3eb0f97e..093d0292 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -105,37 +105,21 @@ def new_func(annotations: pd.DataFrame, duration: int, **kwargs): return decorator -@conversationFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) +@conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) def is_speaker(annotations: pd.DataFrame, **kwargs): return kwargs["speaker"] in annotations['speaker_type'].tolist() -@conversationFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) +@conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) def voc_counter(annotations: pd.DataFrame, **kwargs): return annotations[annotations['speaker_type'] == kwargs["speaker"]]['speaker_type'].count() -@conversationFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) +@conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) def voc_total(annotations: pd.DataFrame, **kwargs): return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) -@conversationFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) -def voc_average(annotations: pd.DataFrame, **kwargs): - return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].mean() - - -def cp_dur(annotations: pd.DataFrame, duration: int, **kwargs): - """canonical proportion on the number of vocalizations for CHI (based on vcm_type) - - Required keyword arguments: - """ - speech_dur = annotations.loc[(annotations["speaker_type"] == "CHI") & - (annotations["vcm_type"].isin(["N", "C"]))]["duration"].sum() - can_dur = annotations.loc[(annotations["speaker_type"] == "CHI") & - (annotations["vcm_type"] == "C")]["duration"].sum() - if speech_dur: - value = can_dur / speech_dur - else: - value = np.nan - return value +# @conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) +# def voc_average(annotations: pd.DataFrame, **kwargs): +# return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].mean() def peak_hour_metric(empty_value=0): From 745f5deb48444c37bef3ed07326043f8f3c5c7ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Fri, 1 Mar 2024 13:46:49 +0100 Subject: [PATCH 11/44] Updated Conversation class --- ChildProject/pipelines/conversations.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index de81c241..25cba214 100644 --- a/ChildProject/pipelines/conversations.py +++ 
b/ChildProject/pipelines/conversations.py
@@ -195,6 +195,9 @@ def _process_recording(self, row): #process recording line
             curr_set = line["set"]
             if prev_set != curr_set:
                 index, annotations = self.retrieve_segments([curr_set], row[1])
+                # Change the annotations dataframe, i.e. group by conversations
+                annotations = annotations.dropna(subset='conv_count')
+                annotations['voc_duration'] = annotations['segment_offset'] - annotations['segment_onset']
                 if index.shape[0]:
                     duration_set = (
                             index["range_offset"] - index["range_onset"]
@@ -204,8 +207,12 @@ def _process_recording(self, row): #process recording line
                 row[1]["duration_{}".format(line["set"])] = duration_set
             prev_set = curr_set
 
-            name, value = line["callable"](annotations, duration_set,
-                                           **line.drop(['callable', 'set']).dropna().to_dict())
+        # name, value = line["callable"](annotations, duration_set,
+        #                                **line.drop(['callable', 'set']).dropna().to_dict())
+        name, value = annotations.groupby('conv_count').apply(
+            lambda conv: line["callable"](conv, duration_set,
+                                          **line.drop(['callable', 'set']).dropna().to_dict()))
+
         row[1][name] = value
 
         return row[1]
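
The groupby/apply call introduced above computes one metric value per conversation; a toy, standalone rendition of the pattern (column names follow the patch, the data is made up):

    import pandas as pd

    # one row per vocalisation; conv_count labels the conversation it belongs to
    annotations = pd.DataFrame({
        "conv_count":   [1, 1, 2, 2, 2],
        "speaker_type": ["CHI", "FEM", "CHI", "MAL", "CHI"],
        "voc_duration": [400, 1200, 300, 900, 250],
    })

    def chi_voc_total(conv: pd.DataFrame) -> float:
        # same logic as the voc_total metric above, restricted to speaker CHI
        return conv[conv["speaker_type"] == "CHI"]["voc_duration"].sum(min_count=1)

    # one value per conversation, as produced by the new groupby/apply
    print(annotations.groupby("conv_count").apply(chi_voc_total))
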
function is prefixed with - 'peak_' - """ - - @functools.wraps(function) - def new_func(annotations: pd.DataFrame, duration: int, **kwargs): - # time to consider for periods, here 1h by default, else put it in kwargs - period_time = 3600000 if 'period_time' not in kwargs else kwargs['period_time'] - periods = duration // period_time # number of hours to consider - - # what hour it belongs to (we made the choice of using onset to choose the hour) - annotations['hour_number_metric'] = annotations['segment_onset'] // period_time - - result_array = np.array([]) - for i in range(periods): - # select the annotations for this hour - period_annotations = annotations[annotations['hour_number_metric'] == i] - - if period_annotations.shape[0]: - # compute metric for the period - metric = function(period_annotations, period_time, **kwargs) - else: - metric = empty_value - - result_array = np.append(result_array, metric) # store the result - - # if we have results, return the max, else return NaN - if len(result_array): - return np.nanmax(result_array) - else: - return np.nan - - # wraps will give the same name and doc, so we need to slightly edit them for the peak function - new_func.__doc__ = "Computing the peak for 1h for the following metric:\n" + function.__doc__ - new_func.__name__ = "peak_" + function.__name__ - new_func.__qualname__ = "peak_" + function.__qualname__ - return new_func - - return decorator - - -def per_hour_metric(): - """ - """ - - def decorator(function): - """Decorator creating a metric function controlling the original value by time. function is suffixed with '_ph' - """ - - @functools.wraps(function) - def new_func(annotations: pd.DataFrame, duration: int, **kwargs): - # time to consider for periods, here 1h by default, else put it in kwargs - return function(annotations, duration, **kwargs) * (3600000 / duration) - - # wraps will give the same name and doc, so we need to slightly edit them for the peak function - new_func.__doc__ = function.__doc__ + "This value is a 'per hour' value." 
- new_func.__name__ = function.__name__ + '_ph' - new_func.__qualname__ = function.__qualname__ + '_ph' - return new_func - - return decorator - - -def voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """number of vocalizations for a given speaker type - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations[annotations["speaker_type"] == kwargs["speaker"]].shape[0] - - -# Decorate for the peak metric, per hour metric, and then the classic metric to avoid conflicts of decoration -peak_voc_speaker = metricFunction({"speaker"}, {"speaker_type"})(peak_hour_metric()(voc_speaker)) -voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type"})(per_hour_metric()(voc_speaker)) -voc_speaker = metricFunction({"speaker"}, {"speaker_type"})(voc_speaker) - - -def voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """total duration of vocalizations by a given speaker type in milliseconds per hour - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations[annotations["speaker_type"] == kwargs["speaker"]]["duration"].sum() - - -# Decorate for the peak metric, per hour metric, and then the classic metric to avoid conflicts of decoration -peak_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "duration"})(peak_hour_metric()(voc_dur_speaker)) -voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "duration"})(per_hour_metric()(voc_dur_speaker)) -voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "duration"})(voc_dur_speaker) - - -@metricFunction({"speaker"}, {"speaker_type", "duration"}, np.nan) -def avg_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """average duration in milliseconds of vocalizations for a given speaker type - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations[annotations["speaker_type"] == kwargs["speaker"]]["duration"].mean() - - -def wc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """number of words for a given speaker type - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations[annotations["speaker_type"] == kwargs["speaker"]]["words"].sum() - - -peak_wc_speaker = metricFunction({"speaker"}, {"speaker_type", "words"})(peak_hour_metric()(wc_speaker)) -wc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "words"})(per_hour_metric()(wc_speaker)) -wc_speaker = metricFunction({"speaker"}, {"speaker_type", "words"})(wc_speaker) - - -def sc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """number of syllables for a given speaker type - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations[annotations["speaker_type"] == kwargs["speaker"]]["syllables"].sum() - - -peak_sc_speaker = metricFunction({"speaker"}, {"speaker_type", "syllables"})(peak_hour_metric()(sc_speaker)) -sc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "syllables"})(per_hour_metric()(sc_speaker)) -sc_speaker = metricFunction({"speaker"}, {"speaker_type", "syllables"})(sc_speaker) - - -def pc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """number of phonemes for a given speaker type - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations[annotations["speaker_type"] == kwargs["speaker"]]["phonemes"].sum() - - -peak_pc_speaker = metricFunction({"speaker"}, {"speaker_type", "phonemes"})(peak_hour_metric()(pc_speaker)) -pc_speaker_ph = 
metricFunction({"speaker"}, {"speaker_type", "phonemes"})(per_hour_metric()(pc_speaker)) -pc_speaker = metricFunction({"speaker"}, {"speaker_type", "phonemes"})(pc_speaker) - - -def wc_adu(annotations: pd.DataFrame, duration: int, **kwargs): - """number of words for all speakers - - Required keyword arguments: - """ - return annotations["words"].sum() - - -peak_wc_adu = metricFunction(set(), {"words"})(peak_hour_metric()(wc_adu)) -wc_adu_ph = metricFunction(set(), {"words"})(per_hour_metric()(wc_adu)) -wc_adu = metricFunction(set(), {"words"})(wc_adu) - - -def sc_adu(annotations: pd.DataFrame, duration: int, **kwargs): - """number of syllables for all speakers - - Required keyword arguments: - """ - return annotations["syllables"].sum() - - -peak_sc_adu = metricFunction(set(), {"syllables"})(peak_hour_metric()(sc_adu)) -sc_adu_ph = metricFunction(set(), {"syllables"})(per_hour_metric()(sc_adu)) -sc_adu = metricFunction(set(), {"syllables"})(sc_adu) - - -def pc_adu(annotations: pd.DataFrame, duration: int, **kwargs): - """number of phonemes for all speakers - - Required keyword arguments: - """ - return annotations["phonemes"].sum() - - -peak_pc_adu = metricFunction(set(), {"phonemes"})(peak_hour_metric()(pc_adu)) -pc_adu_ph = metricFunction(set(), {"phonemes"})(per_hour_metric()(pc_adu)) -pc_adu = metricFunction(set(), {"phonemes"})(pc_adu) - - -def cry_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """number of cry vocalizations for a given speaker (based on vcm_type or lena cries) - - Required keyword arguments: - - speaker : speaker_type to use - """ - if 'vcm_type' in annotations.columns: - return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & - (annotations["vcm_type"] == "Y")].shape[0] - # elif 'cries' in annotations.columns: - else: - return annotations[annotations['speaker_type'] == kwargs["speaker"]]["cries"].apply( - lambda x: len(ast.literal_eval(x))).sum() - - -peak_cry_voc_speaker = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"}) - )(peak_hour_metric()(cry_voc_speaker)) -cry_voc_speaker_ph = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"}) - )(per_hour_metric()(cry_voc_speaker)) -cry_voc_speaker = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"}) - )(cry_voc_speaker) - - -def cry_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """total duration of cry vocalizations by a given speaker type in milliseconds (based on vcm_type or lena cry) - - Required keyword arguments: - - speaker : speaker_type to use - """ - if 'vcm_type' in annotations.columns and 'duration' in annotations.columns: - return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & - (annotations["vcm_type"] == "Y")]["duration"].sum() - # elif 'child_cry_vfx_len' in annotations.columns: - else: - return annotations[annotations['speaker_type'] == kwargs["speaker"]]["child_cry_vfx_len"].sum() - - -peak_cry_voc_dur_speaker = metricFunction({"speaker"}, ( -{"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))( - peak_hour_metric()(cry_voc_dur_speaker)) -cry_voc_dur_speaker_ph = metricFunction({"speaker"}, ( -{"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))( - per_hour_metric()(cry_voc_dur_speaker)) -cry_voc_dur_speaker = metricFunction({"speaker"}, - ({"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))( - cry_voc_dur_speaker) - - 
-@metricFunction({"speaker"}, ({"speaker_type", "vcm_type", "duration"}, {'speaker_type', "child_cry_vfx_len", "cries"}), - np.nan) -def avg_cry_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """average duration of cry vocalizations by a given speaker type (based on vcm_type or lena cries) - - Required keyword arguments: - - speaker : speaker_type to use - """ - if 'vcm_type' in annotations.columns and 'duration' in annotations.columns: - value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & - (annotations["vcm_type"] == "Y")]["duration"].mean() - else: - annots = annotations[annotations['speaker_type'] == kwargs["speaker"]] - value = annots["child_cry_vfx_len"].sum() / annots["cries"].apply(lambda x: len(ast.literal_eval(x))).sum() - - if pd.isnull(value): - value = 0 - return value - - -def can_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """number of canonical vocalizations for a given speaker type (based on vcm_type) - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")].shape[ - 0] - - -peak_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(peak_hour_metric()(can_voc_speaker)) -can_voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(per_hour_metric()(can_voc_speaker)) -can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(can_voc_speaker) - - -def can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """total duration of canonical vocalizations by a given speaker type in milliseconds (based on vcm_type) - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")][ - "duration"].sum() - - -peak_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})( - peak_hour_metric()(can_voc_dur_speaker)) -can_voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})( - per_hour_metric()(can_voc_dur_speaker)) -can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(can_voc_dur_speaker) - - -@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan) -def avg_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """average duration of canonical vocalizations for a given speaker type (based on vcm_type) - - Required keyword arguments: - - speaker : speaker_type to use - """ - value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")][ - "duration"].mean() - if pd.isnull(value): value = 0 - return value - - -def non_can_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """number of non-canonical vocalizations for a given speaker type (based on vcm_type) - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & - (annotations["vcm_type"] == "N")].shape[0] - - -peak_non_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})( - peak_hour_metric()(non_can_voc_speaker)) -non_can_voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})( - per_hour_metric()(non_can_voc_speaker)) -non_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", 
"vcm_type"})(non_can_voc_speaker) - - -def non_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """total duration of non-canonical vocalizations by a given speaker type in milliseconds (based on vcm_type) - - Required keyword arguments: - - speaker : speaker_type to use - """ - return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & - (annotations["vcm_type"] == "N")]["duration"].sum() - - -peak_non_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})( - peak_hour_metric()(non_can_voc_dur_speaker)) -non_can_voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})( - per_hour_metric()(non_can_voc_dur_speaker)) -non_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(non_can_voc_dur_speaker) - - -@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan) -def avg_non_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs): - """average duration of non-canonical vocalizations for a given speaker type (based on vcm_type) - - Required keyword arguments: - - speaker : speaker_type to use - """ - value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & - (annotations["vcm_type"] == "N")]["duration"].mean() - if pd.isnull(value): - value = 0 - return value - - -@metricFunction(set(), set(), np.nan) -def lp_n(annotations: pd.DataFrame, duration: int, **kwargs): - """linguistic proportion on the number of vocalizations for CHI (based on vcm_type or [cries,vfxs,utterances_count] if vcm_type does not exist) - - Required keyword arguments: - """ - if {"cries", "vfxs", "utterances_count"}.issubset(annotations.columns): - annotations = annotations[annotations["speaker_type"] == "CHI"] - cries = annotations["cries"].apply(lambda x: len(ast.literal_eval(x))).sum() - vfxs = annotations["vfxs"].apply(lambda x: len(ast.literal_eval(x))).sum() - utterances = annotations["utterances_count"].sum() - total = (utterances + cries + vfxs) - if total: - value = utterances / total - else: - value = np.nan - elif "vcm_type" in annotations.columns: - speech_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & - (annotations["vcm_type"].isin(["N", "C"]))].shape[0] - cry_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & (annotations["vcm_type"] == "Y")].shape[0] - total = speech_voc + cry_voc - if total: - value = speech_voc / total - else: - value = np.nan - else: - raise ValueError( - "the given set does not have the necessary columns for this metric, choose a set that contains either [" - "vcm_type] or [cries,vfxs,utterances_count]") - return value - - -@metricFunction(set(), {"speaker_type", "vcm_type"}, np.nan) -def cp_n(annotations: pd.DataFrame, duration: int, **kwargs): - """canonical proportion on the number of vocalizations for CHI (based on vcm_type) - - Required keyword arguments: - """ - speech_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & - (annotations["vcm_type"].isin(["N", "C"]))].shape[0] - can_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & (annotations["vcm_type"] == "C")].shape[0] - if speech_voc: - value = can_voc / speech_voc - else: - value = np.nan - return value - - -@metricFunction(set(), set(), np.nan) -def lp_dur(annotations: pd.DataFrame, duration: int, **kwargs): - """linguistic proportion on the duration of vocalizations for CHI (based on vcm_type or [child_cry_vfxs_len,utterances_length] if vcm_type does not exist) - - 
Required keyword arguments: - """ - if {"child_cry_vfx_len", "utterances_length"}.issubset(annotations.columns): - annotations = annotations[annotations["speaker_type"] == "CHI"] - utter_len = annotations["utterances_length"].sum() - total = annotations["child_cry_vfx_len"].sum() + utter_len - if total: - value = utter_len / total - else: - value = np.nan - elif "vcm_type" in annotations.columns: - speech_dur = annotations.loc[(annotations["speaker_type"] == "CHI") & - (annotations["vcm_type"].isin(["N", "C"]))]["duration"].sum() - cry_dur = annotations.loc[(annotations["speaker_type"] == "CHI") & - (annotations["vcm_type"] == "Y")]["duration"].sum() - total = speech_dur + cry_dur - if total: - value = speech_dur / total - else: - value = np.nan - else: - raise ValueError( - "the {} set does not have the necessary columns for this metric, choose a set that contains either [" - "vcm_type] or [child_cry_vfx_len,utterances_length]") - return value - - -@metricFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan) -def cp_dur(annotations: pd.DataFrame, duration: int, **kwargs): - """canonical proportion on the number of vocalizations for CHI (based on vcm_type) - - Required keyword arguments: - """ - speech_dur = annotations.loc[(annotations["speaker_type"] == "CHI") & - (annotations["vcm_type"].isin(["N", "C"]))]["duration"].sum() - can_dur = annotations.loc[(annotations["speaker_type"] == "CHI") & - (annotations["vcm_type"] == "C")]["duration"].sum() - if speech_dur: - value = can_dur / speech_dur - else: - value = np.nan - return value - - -def lena_CVC(annotations: pd.DataFrame, duration: int, **kwargs): - """number of child vocalizations according to LENA's extraction - - Required keyword arguments: - """ - return annotations["utterances_count"].sum() - - -peak_lena_CVC = metricFunction(set(), {"utterances_count"})(peak_hour_metric()(lena_CVC)) -lena_CVC_ph = metricFunction(set(), {"utterances_count"})(per_hour_metric()(lena_CVC)) -lena_CVC = metricFunction(set(), {"utterances_count"})(lena_CVC) - - -def lena_CTC(annotations: pd.DataFrame, duration: int, **kwargs): - """number of conversational turn counts according to LENA's extraction - - Required keyword arguments: - """ - conv_types = {'TIMR', 'TIFR'} - return annotations[annotations["lena_conv_turn_type"].isin(conv_types)].shape[0] - - -peak_lena_CTC = metricFunction(set(), {"lena_conv_turn_type"})(peak_hour_metric()(lena_CTC)) -lena_CTC_ph = metricFunction(set(), {"lena_conv_turn_type"})(per_hour_metric()(lena_CTC)) -lena_CTC = metricFunction(set(), {"lena_conv_turn_type"})(lena_CTC) - - -def simple_CTC(annotations: pd.DataFrame, - duration: int, - interlocutors_1=('CHI',), - interlocutors_2=('FEM', 'MAL', 'OCH'), - max_interval=1000, - min_delay=0, - **kwargs): - """number of conversational turn counts based on vocalizations occurring - in a given interval of one another - - keyword arguments: - - interlocutors_1 : first group of interlocutors, default = ['CHI'] - - interlocutors_2 : second group of interlocutors, default = ['FEM','MAL','OCH'] - - max_interval : maximum interval in ms for it to be considered a turn, default = 1000 - - min_delay : minimum delay between somebody starting speaking - """ - # build the interactants groups, every label in interlocutors_1 can interact with interlocutors_2 and vice versa - speakers = set(interlocutors_1 + interlocutors_2) - interactants = {k: set(interlocutors_2) for k in interlocutors_1} - for k in interlocutors_2: - if k in interactants: - interactants[k] = interactants[k] | 
set(interlocutors_1) - else: - interactants[k] = set(interlocutors_1) - - annotations = annotations[annotations["speaker_type"].isin(speakers)].copy() - - if annotations.shape[0]: - # store the duration between vocalizations - annotations["iti"] = annotations["segment_onset"] - annotations["segment_offset"].shift(1) - # store the previous speaker - annotations["prev_speaker_type"] = annotations["speaker_type"].shift(1) - - annotations["delay"] = annotations["segment_onset"] - annotations["segment_onset"].shift(1) - - # not using absolute value for 'iti' is a choice and should be evaluated (we allow speakers to 'interrupt' - # themselves - annotations["is_CT"] = ( - (annotations.apply(lambda row: row["prev_speaker_type"] in interactants[row['speaker_type']], axis=1)) - & - (annotations['iti'] < max_interval) - & - (annotations['delay'] >= min_delay) - ) - - return annotations['is_CT'].sum() - else: - return 0 - - -peak_simple_CTC = metricFunction(set(), {"speaker_type"})(peak_hour_metric()(simple_CTC)) -simple_CTC_ph = metricFunction(set(), {"speaker_type"})(per_hour_metric()(simple_CTC)) -simple_CTC = metricFunction(set(), {"speaker_type"})(simple_CTC) From 4a846ca7833ea3de20527718b2c6a77437270a18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Fri, 1 Mar 2024 16:26:53 +0100 Subject: [PATCH 13/44] Updated structure by Loann --- .../pipelines/conversationFunctions.py | 127 ++++---- ChildProject/pipelines/conversations.py | 270 ++++-------------- 2 files changed, 114 insertions(+), 283 deletions(-) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index 41b3a7c8..72e61b25 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -24,25 +24,15 @@ !! Metrics functions should still behave and return the correct result when receiving an empty dataframe """ +RESERVED = {'name', 'callable'} # arguments reserved usage. use other keyword labels. -# error message in case of missing columns in annotations -MISSING_COLUMNS = 'The given set <{}> does not have the required column(s) <{}> for computing the {} metric' -RESERVED = {'set', 'name', 'callable'} # arguments reserved usage. use other keyword labels. - - -def conversationFunction(args: set, columns: Union[Set[str], Tuple[Set[str], ...]], empty_value=0, default_name: str = None): +def conversationFunction(args: set = set()): """Decorator for all metrics functions to make them ready to be called by the pipeline. :param args: set of required keyword arguments for that function, raise ValueError if were not given \ you cannot use keywords [name, callable, set] as they are reserved :type args: set - :param columns: required columns in the dataframe given, missing columns raise ValueError - :type columns: set - :param default_name: default name to use for the metric in the resulting dataframe. Every keyword argument found in the name will be replaced by its value (e.g. 'voc_speaker_ph' uses kwarg 'speaker' so if speaker = 'CHI', name will be 'voc_chi_ph'). if no name is given, the __name__ of the function is used - :type default_name: str - :param empty_value: value to return when annotations are empty but the unit was annotated (e.g. 
0 for counts like voc_speaker_ph , None for proportions like lp_n) - :type empty_value: float|int :return: new function to substitute the metric function :rtype: Callable """ @@ -56,90 +46,87 @@ def decorator(function): function.__name__, a, RESERVED)) @functools.wraps(function) - def new_func(annotations: pd.DataFrame, duration: int, **kwargs): + def new_func(annotations: pd.DataFrame, **kwargs): for arg in args: if arg not in kwargs: raise ValueError(f"{function.__name__} metric needs an argument <{arg}>") - # if a name is explicitly given, use it - if 'name' in kwargs and not pd.isnull(kwargs['name']) and kwargs['name']: - metric_name = kwargs['name'] - # else if a default name for the function exists, use the function name - elif default_name: - metric_name = default_name - # else, no name was found, use the name of the function - else: - metric_name = function.__name__ - - metric_name_replaced = metric_name - # metric_name is the basename used to designate this metric (voc_speaker_ph), - # metric_name_replaced replaces the values of kwargs - # found in the name by their values, giving the metric name for that instance only (voc_chi_ph) - for arg in kwargs: - metric_name_replaced = re.sub(arg, str(kwargs[arg]).lower(), metric_name_replaced) - if annotations.shape[0]: - # if multiple possibilities of columns, explore each and fail only if each combination is missing - # a column, if one possibility, fail if a column is missing - if isinstance(columns, tuple) and len(columns) > 0 and isinstance(columns[0], set): - missing_columns = [] - for possible_cols in columns: - possible_missing = possible_cols - set(annotations.columns) - if possible_missing: - missing_columns.append(possible_missing) - # if we have as many cases of missing columns as possibilities, we can't compute the metric - if len(missing_columns) == len(columns): - raise ValueError( - MISSING_COLUMNS.format(annotations['set'].iloc[0], - ' or '.join([str(s) for s in missing_columns]), - metric_name)) - else: - missing_columns = columns - set(annotations.columns) - if missing_columns: - raise ValueError( - MISSING_COLUMNS.format(annotations['set'].iloc[0], missing_columns, metric_name)) - res = function(annotations, duration, **kwargs) - else: # no annotation for that unit - res = empty_value if duration else None # duration != 0 => was annotated but not segments there - return metric_name_replaced, res + + res = function(annotations, **kwargs) + + return res return new_func return decorator -@conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) +@conversationFunction() +def conversation_onset(annotations: pd.DataFrame): + return annotations.reset_index().iloc[0]['segment_onset'] + +@conversationFunction() +def conversation_offset(annotations: pd.DataFrame): + return annotations.reset_index().iloc[-1]['segment_offset'] + +@conversationFunction() +def conversation_duration(annotations: pd.DataFrame): + return annotations.reset_index().iloc[-1]['segment_offset'] - annotations.reset_index().iloc[0]['segment_onset'] + +@conversationFunction() +def vocalisations_count(annotations: pd.DataFrame): + return annotations['speaker_type'].count() + +@conversationFunction() +def who_initiated(annotations: pd.DataFrame): + return annotations.reset_index().iloc[0]['speaker_type'] + +@conversationFunction() +def who_finished(annotations: pd.DataFrame): + return annotations.reset_index().iloc[-1]['speaker_type'] + +@conversationFunction() +def total_duration_of_vocalisations(annotations: pd.DataFrame): + return 
annotations['voc_duration'].sum() + +@conversationFunction({'speaker'}) def is_speaker(annotations: pd.DataFrame, **kwargs): return kwargs["speaker"] in annotations['speaker_type'].tolist() -@conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) +@conversationFunction({'speaker'}) def voc_counter(annotations: pd.DataFrame, **kwargs): return annotations[annotations['speaker_type'] == kwargs["speaker"]]['speaker_type'].count() -@conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) +@conversationFunction({'speaker'}) def voc_total(annotations: pd.DataFrame, **kwargs): return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) -@conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) -def assign_conv_type(conv): - if not conv['CHI_present']: +@conversationFunction() +def assign_conv_type(annotations: pd.DataFrame, **kwargs): + speaker_present = {} + for speaker in ['CHI', 'FEM', 'MAL', 'OCH']: + speaker_present[speaker] = speaker in annotations['speaker_type'].tolist() + speaker_df = pd.DataFrame.from_dict(speaker_present) + + if not speaker_df['CHI']: return 'overheard' - elif conv['CHI_present']: - if not conv['OCH_present'] and conv[['FEM_present', 'MAL_present']].sum() == 1: - if conv['FEM_present']: + elif speaker_df['CHI']: + if not speaker_df['OCH'] and speaker_df[['FEM', 'MAL']].sum() == 1: + if speaker_df['FEM']: return 'dyadic_FEM' - if conv['MAL_present']: + if speaker_df['MAL']: return 'dyadic_MAL' - if conv['OCH_present'] and conv[['FEM_present', 'MAL_present']].sum() == 0: + if speaker_df['OCH'] and speaker_df[['FEM', 'MAL']].sum() == 0: return 'peer' - if not conv['OCH_present'] and conv[['FEM_present', 'MAL_present']].sum() == 2: + if not speaker_df['OCH'] and speaker_df[['FEM', 'MAL']].sum() == 2: return 'parent' - if conv['OCH_present'] and conv[['FEM_present', 'MAL_present']].sum() == 1: - if conv['FEM_present']: + if speaker_df['OCH'] and speaker_df[['FEM', 'MAL']].sum() == 1: + if speaker_df['FEM']: return 'triadic_FEM' - if conv['MAL_present']: + if speaker_df['MAL']: return 'triadic_MAL' - if conv[['OCH_present', 'FEM_present', 'MAL_present']].sum() == 3: + if speaker_df[['OCH', 'FEM', 'MAL']].sum() == 3: return 'multiparty' return np.nan() -# @conversationFunction(set(), {"speaker_type", "conv_count", "duration"}, np.nan) +# @conversationFunction(set()) # def voc_average(annotations: pd.DataFrame, **kwargs): # return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].mean() diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index 25cba214..dd4edc34 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -47,8 +47,8 @@ class Conversations(ABC): def __init__( self, project: ChildProject.projects.ChildProject, + setname: str, metrics_list: pd.DataFrame, - by: str = "recording_filename", recordings: Union[str, List[str], pd.DataFrame] = None, from_time: str = None, to_time: str = None, @@ -78,20 +78,20 @@ def check_callable(row): # block checking presence of required columns and evaluates the callable functions if isinstance(metrics_list, pd.DataFrame): - if ({'callable', 'set'}).issubset(metrics_list.columns): + if ({'callable', 'name'}).issubset(metrics_list.columns): metrics_list["callable"] = metrics_list.apply(check_callable, axis=1) else: - raise ValueError("metrics_list parameter must contain atleast the columns [callable,set]") + 
raise ValueError("metrics_list parameter must contain at least the columns [callable,name]") else: raise ValueError("metrics_list parameter must be a pandas DataFrame") metrics_list.sort_values(by="set", inplace=True) - for setname in np.unique(metrics_list['set'].values): - if setname not in self.am.annotations["set"].values: - raise ValueError( - f"annotation set '{setname}' was not found in the index; " - "check spelling and make sure the set was properly imported." - ) + if setname not in self.am.annotations["set"].values: + raise ValueError( + f"annotation set '{setname}' was not found in the index; " + "check spelling and make sure the set was properly imported." + ) + self.set = setname self.metrics_list = metrics_list # necessary columns to construct the metrics @@ -104,8 +104,7 @@ def check_callable(row): } # get existing columns of the dataset for recordings correct_cols = set(self.project.recordings.columns) - if by != 'segments' and by not in correct_cols: raise ValueError( - "<{}> is not specified in this dataset, cannot extract by it, change your --by option".format(by)) + if rec_cols: # when user requests recording columns, build the list and verify they exist (warn otherwise) rec_cols = set(rec_cols.split(",")) @@ -151,8 +150,10 @@ def check_callable(row): else: self.child_cols = None - self.by = by - self.recordings = Pipeline.recordings_from_list(recordings) + if recordings is None: + self.recordings = self.project.recordings['recording_filename'].to_list() + else: + self.recordings = Pipeline.recordings_from_list(recordings) # turn from_time and to to_time to datetime objects if from_time: @@ -172,13 +173,11 @@ def check_callable(row): else: self.to_time = None - self._initiate_metrics_df() - def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) pipelines[cls.SUBCOMMAND] = cls - def _process_recording(self, row): #process recording line + def _process_recording(self, annotations): #process recording line #keep lines for which conv_count is nopt Na and group by conv """for one unit (i.e. 1 recording) compute the list of required metrics and store the results in the current row of self.metrics @@ -187,35 +186,14 @@ def _process_recording(self, row): #process recording line :return: Series containing all the computed metrics result for that unit :rtype: pandas.Series """ - # row[0] is the index of the row we are processing - # row[1] is the actual Series containing all the metrics for the currently processed line - prev_set = "" - duration_set = 0 + result = {} for i, line in self.metrics_list.iterrows(): - curr_set = line["set"] - if prev_set != curr_set: - index, annotations = self.retrieve_segments([curr_set], row[1]) - # Change the annotations dataframe, i.e. 
group by conversations - annotations = annotations.dropna(subset='conv_count') - annotations['voc_duration'] = annotations['segment_offset'] - annotations['segment_onset'] - if index.shape[0]: - duration_set = ( - index["range_offset"] - index["range_onset"] - ).sum() - else: - duration_set = 0 - row[1]["duration_{}".format(line["set"])] = duration_set - prev_set = curr_set - - # name, value = line["callable"](annotations, duration_set, - # **line.drop(['callable', 'set']).dropna().to_dict()) - name, value = annotations.groupby('conv_count').apply( - lambda conv: line["callable"](conv, duration_set, - **line.drop(['callable', 'set']).dropna().to_dict())) - - row[1][name] = value - - return row[1] + + annotations['voc_duration'] = annotations['segment_offset'] - annotations['segment_onset'] + + result[line['name']] = line["callable"](annotations, **line.drop(['callable', 'name']).dropna().to_dict()) + + return result def extract(self): """from the initiated self.metrics, compute each row metrics (handles threading) @@ -225,19 +203,30 @@ def extract(self): :rtype: pandas.DataFrame """ if self.threads == 1: - self.metrics = pd.DataFrame( - [self._process_recording(row) for row in self.metrics.iterrows()] + full_annotations = pd.concat([self.retrieve_segments(rec) for rec in self.recordings]) + else: + with mp.Pool( + processes=self.threads if self.threads >= 1 else mp.cpu_count() + ) as pool: + full_annotations = pd.concat(pool.map(self.retrieve_segments, self.recordings)) + + conversations = full_annotations.groupby(['recording_filename', 'conv_count']) + + if self.threads == 1: + self.conversations = pd.DataFrame( + [self._process_conversation(conversation) for group, conversation in conversations] ) else: with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() ) as pool: - self.metrics = pd.DataFrame( - pool.map(self._process_recording, self.metrics.iterrows()) + self.conversations = pd.DataFrame( + pool.map(self._process_conversation, [conv for group, conv in conversations]) ) - return self.metrics - def retrieve_segments(self, sets: List[str], row: str): + return self.conversations + + def retrieve_segments(self, recording: str): """from a list of sets and a row identifying the unit computed, return the relevant annotation segments :param sets: List of annotation sets to keep @@ -247,117 +236,30 @@ def retrieve_segments(self, sets: List[str], row: str): :return: relevant annotation DataFrame and index DataFrame :rtype: (pandas.DataFrame , pandas.DataFrame) """ - # if extraction from segments, annotations are retrieved from get_within_ranges - if self.segments is not None: - matches = self.am.get_within_ranges(ranges=pd.DataFrame( - [[row['recording_filename'], row['segment_onset'], row['segment_offset']]], - columns=['recording_filename', 'range_onset', 'range_offset']), - sets=sets, - missing_data='warn') - # else prepare and use get_within_time_range + annotations = self.am.annotations[recording == self.am.annotations['recording_filename']] + annotations = annotations[annotations["set"] == self.set] + # restrict to time ranges + if self.from_time and self.to_time: + matches = self.am.get_within_time_range( + annotations, TimeInterval(self.from_time, self.to_time)) else: - annotations = self.am.annotations[self.am.annotations[self.by] == row[self.by]] - annotations = annotations[annotations["set"].isin(sets)] - # restrict to time ranges - if self.from_time and self.to_time: - # add the periods columns - if self.period: - st_hour = row["period_start"] - end_hour = 
row["period_end"] - intervals = time_intervals_intersect(TimeInterval(self.from_time, self.to_time), - TimeInterval(st_hour, end_hour)) - matches = pd.concat([self.am.get_within_time_range(annotations, i) for i in intervals], - ignore_index=True) if intervals else pd.DataFrame() - else: - matches = self.am.get_within_time_range( - annotations, TimeInterval(self.from_time, self.to_time)) - elif self.period: - # add the periods columns - st_hour = row["period_start"] - end_hour = row["period_end"] - matches = self.am.get_within_time_range( - annotations, TimeInterval(st_hour, end_hour)) - else: - matches = annotations + matches = annotations if matches.shape[0]: segments = self.am.get_segments(matches) + segments = segments.dropna(subset='conv_count') else: # no annotations for that unit return pd.DataFrame(), pd.DataFrame() - # prevent overflows - segments["duration"] = ( - (segments["segment_offset"] - segments["segment_onset"]) - .astype(float) - .fillna(0) - ) - - return matches, segments - - def _initiate_metrics_df(self): - """builds a dataframe with all the rows necessary and their labels - eg : - one row per child if --by child_id and no --period - - 48 rows if 2 recordings in the corpus --period 1h --by recording_filename - Then the extract() method should populate the dataframe with actual metrics - """ - - recordings = self.project.get_recordings_from_list(self.recordings) - self.metrics = pd.DataFrame(recordings[self.by].unique(), columns=[self.by]) - if self.period: - # if period, use the self.periods dataframe to build all the list of segments per unit - self.metrics[ - "key"] = 0 # with old versions of pandas, we are forced to have a common column to do a cross join, we drop the column after - self.periods["key"] = 0 - self.metrics = pd.merge(self.metrics, self.periods, on='key', how='outer').drop('key', axis=1) - - # add info for child_id - self.metrics["child_id"] = self.metrics.apply( - lambda row: self.project.recordings[self.project.recordings[self.by] == row[self.by] - ]["child_id"].iloc[0], - axis=1) - - # get and add to dataframe children.csv columns asked - if self.child_cols: - for label in self.child_cols: - self.metrics[label] = self.metrics.apply(lambda row: - self.project.children[ - self.project.children["child_id"] == row["child_id"] - ][label].iloc[0], axis=1) - - # this loop is for the purpose of checking for name duplicates in the metrics - # we do a dry run on the first line with no annotations bc impractical to check in multiprocessing - df = pd.DataFrame() - duration_set = 0 - names = set() - for i, line in self.metrics_list.iterrows(): - name, value = line["callable"](df, duration_set, **line.drop(['callable', 'set'], - errors='ignore').dropna().to_dict()) + segments['recording_filename'] = recording - if name in names: - raise ValueError('the metric name <{}> is used multiple times, make sure it is unique'.format(name)) - else: - names.add(name) - - # checking that columns added by the user are unique (e.g. 
date_iso may be different when extract by child_id), replace with NA if they are not - def check_unicity(row, label): - value = self.project.recordings[ - self.project.recordings[self.by] == row[self.by] - ][label].drop_duplicates() - # check that there is only one row remaining (ie this column has a unique value for that unit) - if len(value) == 1: - return value.iloc[0] - # otherwise, leave the column as NA - else: - return np.nan + #TODO check that required columns exist - # get and add to dataframe recordings.csv columns asked - if self.rec_cols: - for label in self.rec_cols: - self.metrics[label] = self.metrics.apply(lambda row: check_unicity(row, label), axis=1) + return segments -class CustomMetrics(Conversations): +class CustomConversations(Conversations): """metrics extraction from a csv file. Extracts a number of metrics listed in a csv file as a dataframe. the csv file must contain the columns : @@ -492,7 +394,7 @@ def add_parser(subparsers, subcommand): parser.add_argument("set", help="name of the LENA its annotations set") -class AclewMetrics(Conversations): +class AclewConversations(Conversations): """ACLEW metrics extractor. Extracts a number of metrics from the ACLEW pipeline annotations, which includes: @@ -529,78 +431,22 @@ class AclewMetrics(Conversations): def __init__( self, project: ChildProject.projects.ChildProject, - vtc: str = "vtc", - alice: str = "alice", - vcm: str = "vcm", + setname: str = "vtc/conversations", recordings: Union[str, List[str], pd.DataFrame] = None, from_time: str = None, to_time: str = None, rec_cols: str = None, child_cols: str = None, - by: str = "recording_filename", threads: int = 1, ): - self.vtc = vtc - self.alice = alice - self.vcm = vcm - - am = ChildProject.annotations.AnnotationManager( - project) # temporary instance to check for existing sets. This is suboptimal because an annotation manager will be created by Metrics. 
However, the metrics class raises a ValueError for every set passed that does not exist, here we want to check in advance which of the alice and vcm sets exist without raising an error - METRICS = np.array( - [["voc_speaker_ph", self.vtc, 'FEM'], - ["voc_speaker_ph", self.vtc, 'MAL'], - ["voc_speaker_ph", self.vtc, 'OCH'], - ["voc_speaker_ph", self.vtc, 'CHI'], - ["voc_dur_speaker_ph", self.vtc, 'FEM'], - ["voc_dur_speaker_ph", self.vtc, 'MAL'], - ["voc_dur_speaker_ph", self.vtc, 'OCH'], - ["voc_dur_speaker_ph", self.vtc, 'CHI'], - ["avg_voc_dur_speaker", self.vtc, 'FEM'], - ["avg_voc_dur_speaker", self.vtc, 'MAL'], - ["avg_voc_dur_speaker", self.vtc, 'OCH'], - ["avg_voc_dur_speaker", self.vtc, 'CHI'], - ["simple_CTC_ph", self.vtc, pd.NA], + [["conversation_onset", "conversation_onset", pd.NA], ]) - if self.alice not in am.annotations["set"].values: - print(f"The ALICE set ('{self.alice}') was not found in the index.") - else: - METRICS = np.concatenate((METRICS, np.array( - [["wc_speaker_ph", self.alice, 'FEM'], - ["wc_speaker_ph", self.alice, 'MAL'], - ["sc_speaker_ph", self.alice, 'FEM'], - ["sc_speaker_ph", self.alice, 'MAL'], - ["pc_speaker_ph", self.alice, 'FEM'], - ["pc_speaker_ph", self.alice, 'MAL'], - ["wc_adu_ph", self.alice, pd.NA], - ["sc_adu_ph", self.alice, pd.NA], - ["pc_adu_ph", self.alice, pd.NA], - ]))) - - if self.vcm not in am.annotations["set"].values: - print(f"The vcm set ('{self.vcm}') was not found in the index.") - else: - METRICS = np.concatenate((METRICS, np.array( - [["cry_voc_speaker_ph", self.vcm, 'CHI'], - ["cry_voc_dur_speaker_ph", self.vcm, 'CHI'], - ["avg_cry_voc_dur_speaker", self.vcm, 'CHI'], - ["can_voc_speaker_ph", self.vcm, 'CHI'], - ["can_voc_dur_speaker_ph", self.vcm, 'CHI'], - ["avg_can_voc_dur_speaker", self.vcm, 'CHI'], - ["non_can_voc_speaker_ph", self.vcm, 'CHI'], - ["non_can_voc_dur_speaker_ph", self.vcm, 'CHI'], - ["avg_non_can_voc_dur_speaker", self.vcm, 'CHI'], - ["lp_n", self.vcm, pd.NA], - ["lp_dur", self.vcm, pd.NA], - ["cp_n", self.vcm, pd.NA], - ["cp_dur", self.vcm, pd.NA], - ]))) - - METRICS = pd.DataFrame(METRICS, columns=["callable", "set", "speaker"]) + METRICS = pd.DataFrame(METRICS, columns=["callable", "name", "speaker"]) - super().__init__(project, METRICS, by=by, recordings=recordings, + super().__init__(project, setname, METRICS, recordings=recordings, from_time=from_time, to_time=to_time, rec_cols=rec_cols, child_cols=child_cols, threads=threads) @@ -608,9 +454,7 @@ def __init__( @staticmethod def add_parser(subparsers, subcommand): parser = subparsers.add_parser(subcommand, help="LENA metrics") - parser.add_argument("--vtc", help="vtc set", default="vtc") - parser.add_argument("--alice", help="alice set", default="alice") - parser.add_argument("--vcm", help="vcm set", default="vcm") + parser.add_argument("--set", help="set", default="vtc/conversations") class ConversationsPipeline(Pipeline): From aafbb6b8f0ac4c80682282aadf2f17030f81f771 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Fri, 1 Mar 2024 17:47:47 +0100 Subject: [PATCH 14/44] Clean metrics --- .../pipelines/conversationFunctions.py | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index 72e61b25..5c1653ce 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -59,75 +59,98 @@ def new_func(annotations: pd.DataFrame, **kwargs): return decorator + 
@conversationFunction() def conversation_onset(annotations: pd.DataFrame): return annotations.reset_index().iloc[0]['segment_onset'] + @conversationFunction() def conversation_offset(annotations: pd.DataFrame): return annotations.reset_index().iloc[-1]['segment_offset'] + @conversationFunction() def conversation_duration(annotations: pd.DataFrame): return annotations.reset_index().iloc[-1]['segment_offset'] - annotations.reset_index().iloc[0]['segment_onset'] + @conversationFunction() def vocalisations_count(annotations: pd.DataFrame): return annotations['speaker_type'].count() + @conversationFunction() def who_initiated(annotations: pd.DataFrame): return annotations.reset_index().iloc[0]['speaker_type'] + @conversationFunction() def who_finished(annotations: pd.DataFrame): return annotations.reset_index().iloc[-1]['speaker_type'] + @conversationFunction() def total_duration_of_vocalisations(annotations: pd.DataFrame): return annotations['voc_duration'].sum() + @conversationFunction({'speaker'}) def is_speaker(annotations: pd.DataFrame, **kwargs): return kwargs["speaker"] in annotations['speaker_type'].tolist() + @conversationFunction({'speaker'}) def voc_counter(annotations: pd.DataFrame, **kwargs): return annotations[annotations['speaker_type'] == kwargs["speaker"]]['speaker_type'].count() + @conversationFunction({'speaker'}) def voc_total(annotations: pd.DataFrame, **kwargs): return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) + +@conversationFunction({'speaker'}) +def voc_contribution(annotations: pd.DataFrame, **kwargs): + speaker_total = annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) + total = annotations['voc_duration'].sum() + return speaker_total / total + + @conversationFunction() -def assign_conv_type(annotations: pd.DataFrame, **kwargs): +def assign_conv_type(annotations: pd.DataFrame): + #pd.Categorical(['overheard', 'dyadic_FEM', 'dyadic_MAL', 'peer', 'parent', 'triadic_FEM', 'triadic_MAL', 'multiparty']) speaker_present = {} for speaker in ['CHI', 'FEM', 'MAL', 'OCH']: - speaker_present[speaker] = speaker in annotations['speaker_type'].tolist() - speaker_df = pd.DataFrame.from_dict(speaker_present) + speaker_present[speaker] = [speaker in annotations['speaker_type'].tolist()] + speaker_df = pd.DataFrame.from_dict(speaker_present).iloc[0, :] if not speaker_df['CHI']: return 'overheard' + elif speaker_df['CHI']: if not speaker_df['OCH'] and speaker_df[['FEM', 'MAL']].sum() == 1: if speaker_df['FEM']: return 'dyadic_FEM' + if speaker_df['MAL']: return 'dyadic_MAL' + if speaker_df['OCH'] and speaker_df[['FEM', 'MAL']].sum() == 0: return 'peer' + if not speaker_df['OCH'] and speaker_df[['FEM', 'MAL']].sum() == 2: return 'parent' + if speaker_df['OCH'] and speaker_df[['FEM', 'MAL']].sum() == 1: if speaker_df['FEM']: return 'triadic_FEM' if speaker_df['MAL']: return 'triadic_MAL' + if speaker_df[['OCH', 'FEM', 'MAL']].sum() == 3: return 'multiparty' return np.nan() -# @conversationFunction(set()) -# def voc_average(annotations: pd.DataFrame, **kwargs): -# return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].mean() + From ac50346a1ee2b845595041c433e94d2ae47ed3f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Fri, 1 Mar 2024 17:47:59 +0100 Subject: [PATCH 15/44] Updated list of metrics --- ChildProject/pipelines/conversations.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git 
a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index dd4edc34..81f5ba19 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -442,6 +442,30 @@ def __init__( METRICS = np.array( [["conversation_onset", "conversation_onset", pd.NA], + ["conversation_offset", "conversation_offset", pd.NA], + ["conversation_duration", "conversation_duration", pd.NA], + ["vocalisations_count", "vocalisations_count", pd.NA], + ["who_initiated", "initiator", pd.NA], + ["who_finished", "finisher", pd.NA], + ["total_duration_of_vocalisations", "total_duration_of_vocalisations", pd.NA], + ["conversation_duration", "conversation_duration", pd.NA], + ["is_speaker", "CHI_present", 'CHI'], + ["is_speaker", "FEM_present", 'FEM'], + ["is_speaker", "MAL_present", 'MAL'], + ["is_speaker", "OCH_present", 'OCH'], + ["voc_counter", "CHI_voc_counter", 'CHI'], + ["voc_counter", "FEM_voc_counter", 'FEM'], + ["voc_counter", "MAL_voc_counter", 'MAL'], + ["voc_counter", "OCH_voc_counter", 'OCH'], + ["voc_total", "CHI_voc_total", 'CHI'], + ["voc_total", "FEM_voc_total", 'FEM'], + ["voc_total", "MAL_voc_total", 'MAL'], + ["voc_total", "OCH_voc_total", 'OCH'], + ["voc_contribution", "CHI_voc_contribution", 'CHI'], + ["voc_contribution", "FEM_voc_contribution", 'FEM'], + ["voc_contribution", "MAL_voc_contribution", 'MAL'], + ["voc_contribution", "OCH_voc_contribution", 'OCH'], + ["assign_conv_type", "conversation_type", pd.NA], ]) METRICS = pd.DataFrame(METRICS, columns=["callable", "name", "speaker"]) From efe1dfb1e16866a0ef1e4f8f28475231e6851e46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Mon, 11 Mar 2024 18:25:12 +0100 Subject: [PATCH 16/44] Update conversationFunctions.py --- ChildProject/pipelines/conversationFunctions.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index 5c1653ce..0237a1af 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -25,7 +25,12 @@ !! Metrics functions should still behave and return the correct result when receiving an empty dataframe """ RESERVED = {'name', 'callable'} # arguments reserved usage. use other keyword labels. - +#TODO +# 1. Start and end time of each conversation in the recording +# 2. Duration of time between conversations (e.g., time between Convo 1 and Convo 2) +# 3. Key Child ID (i.e., some identifier for the key child in the data set) +# 4. Recording ID (i.e., which of the Key Child's recordings is this, if the Key Child has multiple recordings) +# 5. A string with a list of speaker tags in the conversation (e.g., "CHI, FEM, OCH") def conversationFunction(args: set = set()): """Decorator for all metrics functions to make them ready to be called by the pipeline. 
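Note on the TODO list above: item 5 (a string listing the speaker tags present in a conversation) can already be expressed with the conversationFunction decorator these patches introduce. The sketch below is a minimal, hypothetical example, not part of the patch series: the function name speaker_inventory is invented here, and it assumes the module state as of this patch.

    import pandas as pd
    from ChildProject.pipelines.conversationFunctions import conversationFunction

    @conversationFunction()
    def speaker_inventory(annotations: pd.DataFrame):
        # TODO item 5: a string of the speaker tags present, e.g. "CHI, FEM, OCH"
        return ', '.join(sorted(annotations['speaker_type'].dropna().unique()))

Registering such a function would only take one extra row in a metrics list, e.g. ["speaker_inventory", "speaker_inventory", pd.NA], since the pipeline resolves string callables with getattr on conversationFunctions (see check_callable above).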
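For readers following the assign_conv_type logic, here is a quick sanity check of the classification on toy conversations (the example data is hypothetical; assign_conv_type itself is the function defined in these patches):

    import pandas as pd
    from ChildProject.pipelines.conversationFunctions import assign_conv_type

    # CHI and FEM only: no OCH, exactly one of FEM/MAL present
    conv = pd.DataFrame({'speaker_type': ['CHI', 'FEM', 'CHI']})
    assert assign_conv_type(conv) == 'dyadic_FEM'

    # no CHI at all: classified as 'overheard'
    conv = pd.DataFrame({'speaker_type': ['FEM', 'MAL']})
    assert assign_conv_type(conv) == 'overheard'

The eight labels (overheard, dyadic_FEM, dyadic_MAL, peer, parent, triadic_FEM, triadic_MAL, multiparty) partition conversations by which of CHI, OCH, FEM and MAL are present.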
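To make the overall flow concrete before the next patch: each metric row is applied to one conversation block, i.e. the segments sharing a conv_count within a recording. The following is a condensed, self-contained sketch of the extraction loop these patches converge on; the toy segments are hypothetical (the real pipeline retrieves them from the annotation index).

    import pandas as pd
    from ChildProject.pipelines import conversationFunctions as convfunc

    segments = pd.DataFrame({
        'recording_filename': ['rec.wav'] * 4,
        'conv_count':     [1, 1, 2, 2],
        'speaker_type':   ['CHI', 'FEM', 'OCH', 'CHI'],
        'segment_onset':  [0, 900, 5000, 6100],
        'segment_offset': [800, 2000, 6000, 7000],
    })
    metrics_list = pd.DataFrame(
        [['voc_counter', 'CHI_voc_counter', 'CHI'],
         ['who_initiated', 'initiator', pd.NA]],
        columns=['callable', 'name', 'speaker'])
    # resolve string names to functions, as check_callable does
    metrics_list['callable'] = metrics_list['callable'].apply(lambda f: getattr(convfunc, f))

    results = []
    for (rec, conv), annotations in segments.groupby(['recording_filename', 'conv_count']):
        annotations = annotations.copy()
        annotations['voc_duration'] = annotations['segment_offset'] - annotations['segment_onset']
        row = {'recording_filename': rec, 'conv_count': conv}
        for _, line in metrics_list.iterrows():
            # remaining row entries (here: speaker) become keyword arguments
            row[line['name']] = line['callable'](annotations,
                                                 **line.drop(['callable', 'name']).dropna().to_dict())
        results.append(row)
    print(pd.DataFrame(results))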
From d3c06cde0880f303e6e4d00ef5c2f28bade93007 Mon Sep 17 00:00:00 2001
From: LPeurey
Date: Thu, 14 Mar 2024 17:09:36 +0100
Subject: [PATCH 17/44] ready conversations summary just for standard extraction, without any other feature

---
 .../pipelines/conversationFunctions.py   |   4 +-
 ChildProject/pipelines/conversations.py  | 177 +++++++-----------
 ChildProject/pipelines/metrics.py        |   2 +-
 3 files changed, 67 insertions(+), 116 deletions(-)

diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py
index 0237a1af..be895a66 100644
--- a/ChildProject/pipelines/conversationFunctions.py
+++ b/ChildProject/pipelines/conversationFunctions.py
@@ -28,8 +28,6 @@
 #TODO
 # 1. Start and end time of each conversation in the recording
 # 2. Duration of time between conversations (e.g., time between Convo 1 and Convo 2)
-# 3. Key Child ID (i.e., some identifier for the key child in the data set)
-# 4. Recording ID (i.e., which of the Key Child's recordings is this, if the Key Child has multiple recordings)
 # 5. A string with a list of speaker tags in the conversation (e.g., "CHI, FEM, OCH")

 def conversationFunction(args: set = set()):
@@ -155,7 +153,7 @@ def assign_conv_type(annotations: pd.DataFrame):
     if speaker_df[['OCH', 'FEM', 'MAL']].sum() == 3:
         return 'multiparty'
-    return np.nan()
+    return np.nan

diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py
index 81f5ba19..f77059a5 100644
--- a/ChildProject/pipelines/conversations.py
+++ b/ChildProject/pipelines/conversations.py
@@ -15,7 +15,7 @@
 from ChildProject.tables import assert_dataframe, assert_columns_presence, read_csv_with_dtype
 import ChildProject.pipelines.conversationFunctions as convfunc
-from ..utils import TimeInterval, time_intervals_intersect
+from ..utils import TimeInterval

 pipelines = {}

@@ -60,8 +60,10 @@ def __init__(
         self.project = project
         self.am = ChildProject.annotations.AnnotationManager(self.project)
         self.threads = int(threads)
+        self.conversations = None

-        # check that the callable column is either a callable function or a string that can be found as being part of the list of metrics in ChildProject/pipelines/conversationFunctions.py
+        # check that the callable column is either a callable function or a string that can be found as being part of
+        # the list of metrics in ChildProject/pipelines/conversationFunctions.py
         def check_callable(row):
             if callable(row["callable"]): return row["callable"]
             if isinstance(row["callable"], str):
@@ -84,7 +86,6 @@ def check_callable(row):
                 raise ValueError("metrics_list parameter must contain at least the columns [callable,name]")
         else:
             raise ValueError("metrics_list parameter must be a pandas DataFrame")
-        metrics_list.sort_values(by="set", inplace=True)

         if setname not in self.am.annotations["set"].values:
             raise ValueError(
@@ -177,7 +178,7 @@ def __init_subclass__(cls, **kwargs):
         super().__init_subclass__(**kwargs)
         pipelines[cls.SUBCOMMAND] = cls

-    def _process_recording(self, annotations): #process recording line
+    def _process_conversation(self, conversation):  # process one conversation
         #keep lines for which conv_count is not NA and group by conversation
         """for one unit (i.e.
1 recording) compute the list of required metrics and store the results in the current row of self.metrics @@ -186,7 +187,8 @@ def _process_recording(self, annotations): #process recording line :return: Series containing all the computed metrics result for that unit :rtype: pandas.Series """ - result = {} + meta, annotations = conversation + result = {'recording_filename': meta[0], 'conv_count': meta[1]} for i, line in self.metrics_list.iterrows(): annotations['voc_duration'] = annotations['segment_offset'] - annotations['segment_onset'] @@ -214,16 +216,49 @@ def extract(self): if self.threads == 1: self.conversations = pd.DataFrame( - [self._process_conversation(conversation) for group, conversation in conversations] + [self._process_conversation(block) for block in conversations] ) else: with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() ) as pool: self.conversations = pd.DataFrame( - pool.map(self._process_conversation, [conv for group, conv in conversations]) + pool.map(self._process_conversation, [block for block in conversations]) ) + # now add the rec_cols and child_cols in the result + if self.rec_cols: + if self.child_cols: + recs = self.project.recordings.drop(columns=( + [col for col in self.project.recordings.columns if (col not in self.rec_cols + and col != 'recording_filename' + and col != 'child_id')] + )) + chis = self.project.children.drop(columns=( + [col for col in self.project.children.columns if (col not in self.child_cols + and col != 'child_id')] + )) + meta = recs.merge(chis, how='inner', on='child_id') + self.conversations = self.conversations.merge(meta, how='left', on='recording_filename') + if 'child_id' not in self.child_cols and 'child_id' not in self.rec_cols: + self.conversations.drop(columns=['child_id']) + else: + recs = self.project.recordings.drop(columns=( + [col for col in self.project.recordings.columns if (col not in self.rec_cols + and col != 'recording_filename' + and col != 'child_id')] + )) + self.conversations = self.conversations.merge(recs, how='left', on='recording_filename') + elif self.child_cols: + chis = self.project.children.drop(columns=( + [col for col in self.project.children.columns if (col not in self.child_cols + and col != 'child_id')] + )) + meta = chis.merge(self.project.recordings[['recording_filename','child_id']], how='inner', on='child_id') + self.conversations = self.conversations.merge(meta, how='left', on='recording_filename') + if 'child_id' not in self.child_cols: + self.conversations.drop(columns=['child_id']) + return self.conversations def retrieve_segments(self, recording: str): @@ -310,91 +345,10 @@ def __init__( @staticmethod def add_parser(subparsers, subcommand): - parser = subparsers.add_parser(subcommand, help="metrics from a csv file") - parser.add_argument("metrics", - help="name if the csv file containing the list of metrics", - ) - - -class LenaMetrics(Conversations): - """LENA metrics extractor. - Extracts a number of metrics from the LENA .its annotations. - - :param project: ChildProject instance of the target dataset. 
- :type project: ChildProject.projects.ChildProject - :param set: name of the set associated to the .its annotations - :type set: str - :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None - :type recordings: Union[str, List[str], pd.DataFrame], optional - :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None - :type from_time: str, optional - :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None - :type to_time: str, optional - :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value - :type rec_cols: str, optional - :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default - :type child_cols: str, optional - :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument - :type by: str, optional - :param threads: amount of threads to run on, defaults to 1 - :type threads: int, optional - """ - - SUBCOMMAND = "lena" - - def __init__( - self, - project: ChildProject.projects.ChildProject, - set: str, - recordings: Union[str, List[str], pd.DataFrame] = None, - from_time: str = None, - to_time: str = None, - rec_cols: str = None, - child_cols: str = None, - by: str = "recording_filename", - threads: int = 1, - ): - self.set = set - - METRICS = pd.DataFrame(np.array( - [["voc_speaker_ph", self.set, 'FEM'], - ["voc_speaker_ph", self.set, 'MAL'], - ["voc_speaker_ph", self.set, 'OCH'], - ["voc_speaker_ph", self.set, 'CHI'], - ["voc_dur_speaker_ph", self.set, 'FEM'], - ["voc_dur_speaker_ph", self.set, 'MAL'], - ["voc_dur_speaker_ph", self.set, 'OCH'], - ["voc_dur_speaker_ph", self.set, 'CHI'], - ["avg_voc_dur_speaker", self.set, 'FEM'], - ["avg_voc_dur_speaker", self.set, 'MAL'], - ["avg_voc_dur_speaker", self.set, 'OCH'], - ["avg_voc_dur_speaker", self.set, 'CHI'], - ["wc_speaker_ph", self.set, 'FEM'], - ["wc_speaker_ph", self.set, 'MAL'], - ["wc_adu_ph", self.set, pd.NA], - ["lp_n", self.set, pd.NA], - ["lp_dur", self.set, pd.NA], - ["lena_CVC", self.set, pd.NA], - ["lena_CTC", self.set, pd.NA], - ]), columns=["callable", "set", "speaker"]) - - super().__init__(project, METRICS, by=by, recordings=recordings, - from_time=from_time, to_time=to_time, rec_cols=rec_cols, - child_cols=child_cols, threads=threads) - - if self.set not in self.am.annotations["set"].values: - raise ValueError( - f"annotation set '{self.set}' was not found in the index; " - "check spelling and make sure the set was properly imported." - ) - - @staticmethod - def add_parser(subparsers, subcommand): - parser = subparsers.add_parser(subcommand, help="LENA metrics") - parser.add_argument("set", help="name of the LENA its annotations set") + pass -class AclewConversations(Conversations): +class StandardConversations(Conversations): """ACLEW metrics extractor. 
Extracts a number of metrics from the ACLEW pipeline annotations, which includes: @@ -426,7 +380,7 @@ class AclewConversations(Conversations): :type threads: int, optional """ - SUBCOMMAND = "aclew" + SUBCOMMAND = "standard" def __init__( self, @@ -477,8 +431,7 @@ def __init__( @staticmethod def add_parser(subparsers, subcommand): - parser = subparsers.add_parser(subcommand, help="LENA metrics") - parser.add_argument("--set", help="set", default="vtc/conversations") + parser = subparsers.add_parser(subcommand, help="standard conversation extraction") class ConversationsPipeline(Pipeline): @@ -509,14 +462,14 @@ def run(self, path, destination, pipeline, func=None, **kwargs): if pipeline not in pipelines: raise NotImplementedError(f"invalid pipeline '{pipeline}'") - metrics = pipelines[pipeline](self.project, **kwargs) - metrics.extract() + conversations = pipelines[pipeline](self.project, **kwargs) + conversations.extract() - self.metrics = metrics.metrics - self.metrics.to_csv(self.destination, index=False) + self.conversations = conversations.conversations + self.conversations.to_csv(self.destination, index=False) # get the df of metrics used from the Metrics class - metrics_df = metrics.metrics_list + metrics_df = conversations.metrics_list metrics_df['callable'] = metrics_df.apply(lambda row: row['callable'].__name__, axis=1) # from the callables used, find their name back parameters['metrics_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in @@ -535,7 +488,7 @@ def run(self, path, destination, pipeline, func=None, **kwargs): ) print("exported sampler parameters to {}".format(self.parameters_path)) - return self.metrics + return self.conversations @staticmethod def setup_parser(parser): @@ -547,16 +500,16 @@ def setup_parser(parser): pipelines[pipeline].add_parser(subparsers, pipeline) parser.add_argument( - "--recordings", - help="path to a CSV dataframe containing the list of recordings to sample from (by default, all recordings will be sampled). The CSV should have one column named recording_filename.", - default=None, + "--set", + help="Set to use to get the conversation annotations", + required=True, + dest='setname' ) parser.add_argument( - "--by", - help="units to sample from (default behavior is to sample by recording)", - choices=["recording_filename", "session_id", "child_id", "experiment", "segments"], - default="recording_filename", + "--recordings", + help="path to a CSV dataframe containing the list of recordings to sample from (by default, all recordings will be sampled). 
The CSV should have one column named recording_filename.", + default=None, ) parser.add_argument( @@ -648,16 +601,16 @@ def run(self, parameters_input, func=None): if key not in {"metrics_list", "path", "destination", "dataset_hash"} } try: - metrics = Conversations(self.project, metrics_df, **arguments) + conversations = Conversations(self.project, metrics_df, **arguments) except TypeError as e: raise ValueError('Unrecognized parameter found {}'.format(e.args[0][46:])) from e - metrics.extract() + conversations.extract() - self.metrics = metrics.metrics - self.metrics.to_csv(self.destination, index=False) + self.conversations = conversations.conversations + self.conversations.to_csv(self.destination, index=False) # get the df of metrics used from the Metrics class - metrics_df = metrics.metrics_list + metrics_df = conversations.metrics_list metrics_df['callable'] = metrics_df.apply(lambda row: row['callable'].__name__, axis=1) # from the callables used, find their name back parameters['metrics_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in @@ -676,7 +629,7 @@ def run(self, parameters_input, func=None): ) print("exported metrics parameters to {}".format(self.parameters_path)) - return self.metrics + return self.conversations @staticmethod def setup_parser(parser): diff --git a/ChildProject/pipelines/metrics.py b/ChildProject/pipelines/metrics.py index b07b265f..e5bcb1b6 100644 --- a/ChildProject/pipelines/metrics.py +++ b/ChildProject/pipelines/metrics.py @@ -650,7 +650,7 @@ def __init__( @staticmethod def add_parser(subparsers, subcommand): - parser = subparsers.add_parser(subcommand, help="LENA metrics") + parser = subparsers.add_parser(subcommand, help="ACLEW metrics") parser.add_argument("--vtc", help="vtc set", default="vtc") parser.add_argument("--alice", help="alice set", default="alice") parser.add_argument("--vcm", help="vcm set", default="vcm") From f47e5aab6bc3d2ea054799446203d3266e31796d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Thu, 14 Mar 2024 17:25:01 +0100 Subject: [PATCH 18/44] Added who_participates column --- ChildProject/pipelines/conversationFunctions.py | 3 +++ ChildProject/pipelines/conversations.py | 1 + 2 files changed, 4 insertions(+) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index be895a66..c243342d 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -92,6 +92,9 @@ def who_initiated(annotations: pd.DataFrame): def who_finished(annotations: pd.DataFrame): return annotations.reset_index().iloc[-1]['speaker_type'] +@conversationFunction() +def who_participates(annotations: pd.DataFrame): + return annotations.reset_index()['speaker_type'].unique() @conversationFunction() def total_duration_of_vocalisations(annotations: pd.DataFrame): diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index f77059a5..dcac3367 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -401,6 +401,7 @@ def __init__( ["vocalisations_count", "vocalisations_count", pd.NA], ["who_initiated", "initiator", pd.NA], ["who_finished", "finisher", pd.NA], + ["who_participates", "participators", pd.NA], ["total_duration_of_vocalisations", "total_duration_of_vocalisations", pd.NA], ["conversation_duration", "conversation_duration", pd.NA], ["is_speaker", "CHI_present", 'CHI'], From 9107acbb2cd4bb32a986ae7d61f73aac7d84f5f5 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Thu, 14 Mar 2024 17:35:18 +0100 Subject: [PATCH 19/44] corrected typo --- ChildProject/pipelines/conversations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index dcac3367..dd0f9c47 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -401,7 +401,7 @@ def __init__( ["vocalisations_count", "vocalisations_count", pd.NA], ["who_initiated", "initiator", pd.NA], ["who_finished", "finisher", pd.NA], - ["who_participates", "participators", pd.NA], + ["who_participates", "participants", pd.NA], ["total_duration_of_vocalisations", "total_duration_of_vocalisations", pd.NA], ["conversation_duration", "conversation_duration", pd.NA], ["is_speaker", "CHI_present", 'CHI'], From f71d7683634b5384443d6393fc151f8721fa84ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Agata=20Kozio=C5=82?= Date: Thu, 14 Mar 2024 17:42:54 +0100 Subject: [PATCH 20/44] Changed display of participants column --- ChildProject/pipelines/conversationFunctions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index c243342d..5c6286fc 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -94,7 +94,7 @@ def who_finished(annotations: pd.DataFrame): @conversationFunction() def who_participates(annotations: pd.DataFrame): - return annotations.reset_index()['speaker_type'].unique() + return '/'.join(annotations.reset_index()['speaker_type'].unique()) @conversationFunction() def total_duration_of_vocalisations(annotations: pd.DataFrame): From 1e67ca6c1a3567175f772cf26da2c06d145b5e52 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Wed, 17 Apr 2024 11:44:07 +0200 Subject: [PATCH 21/44] remove .DS_Store --- .../valid_raw_data/annotations/alice/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 examples/valid_raw_data/annotations/alice/.DS_Store diff --git a/examples/valid_raw_data/annotations/alice/.DS_Store b/examples/valid_raw_data/annotations/alice/.DS_Store deleted file mode 100644 index 79bc424dfe9688bd5dd30d2e58abed6c5c569f39..0000000000000000000000000000000000000000 GIT binary patch (binary content omitted) Date: Wed, 24 Apr 2024 13:41:40 +0200 Subject: [PATCH 22/44] minor formatting changes in metrics --- ChildProject/pipelines/metrics.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ChildProject/pipelines/metrics.py b/ChildProject/pipelines/metrics.py index e5bcb1b6..7e7d5c8f 100644 --- a/ChildProject/pipelines/metrics.py +++ b/ChildProject/pipelines/metrics.py @@ -3,6 +3,8 @@ import argparse import datetime import multiprocessing as mp +import logging + import numpy as np import pandas as pd from typing import Union, List @@ -19,6 +21,11 @@ pipelines = {} +# Create a logger for the module (file) +logger_metrics = logging.getLogger(__name__) +# messages are propagated to the higher level logger (ChildProject), used in cmdline.py +logger_metrics.propagate = True + class Metrics(ABC): """ Main class for generating metrics from a project object and a list of desired metrics
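The hunk above wires up the same per-module logging pattern introduced for conversations.py: each pipeline module gets a logger named after the module, and its records propagate up to the package-level ChildProject logger configured in cmdline.py. A minimal sketch of how the pieces interact is below; the handler and level are assumptions for illustration, only the logger names and the propagate flag come from the patch:

    import logging

    # stand-in for the package-level configuration done elsewhere
    # (StreamHandler and INFO level are assumptions, not taken from the patch)
    package_logger = logging.getLogger("ChildProject")
    package_logger.addHandler(logging.StreamHandler())
    package_logger.setLevel(logging.INFO)

    # what the hunk adds: a module logger; __name__ resolves to
    # "ChildProject.pipelines.metrics", a child of the "ChildProject" logger
    logger_metrics = logging.getLogger("ChildProject.pipelines.metrics")
    logger_metrics.propagate = True  # the default, made explicit as in the patch

    logger_metrics.warning("this record is printed by the package-level handler")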
@@ -442,7 +449,7 @@ def add_parser(subparsers, subcommand): parser = subparsers.add_parser(subcommand, help="metrics from a csv file") parser.add_argument("metrics", - help="name if the csv file containing the list of metrics", + help="name of the csv file containing the list of metrics", ) class LenaMetrics(Metrics): From bd0f1a02a34fcb7829b054651838e4577d4f6209 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Wed, 24 Apr 2024 13:42:03 +0200 Subject: [PATCH 23/44] making custom and specification pipelines work --- ChildProject/pipelines/conversations.py | 231 +++++++++++++----------- 1 file changed, 127 insertions(+), 104 deletions(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index dd0f9c47..18729053 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -3,6 +3,8 @@ import argparse import datetime import multiprocessing as mp +import logging + import numpy as np import pandas as pd from typing import Union, List @@ -12,6 +14,7 @@ import ChildProject from ChildProject.pipelines.pipeline import Pipeline +from ChildProject.annotations import AnnotationManager from ChildProject.tables import assert_dataframe, assert_columns_presence, read_csv_with_dtype import ChildProject.pipelines.conversationFunctions as convfunc @@ -19,26 +22,30 @@ pipelines = {} +# Create a logger for the module (file) +logger_conversations = logging.getLogger(__name__) +# messages are propagated to the higher level logger (ChildProject), used in cmdline.py +logger_conversations.propagate = True class Conversations(ABC): """ - Main class for generating conversational metrics from a project object and a list of desired metrics + Main class for generating a conversational extraction from a project object and a list of desired features :param project: ChildProject instance of the target dataset. :type project: ChildProject.projects.ChildProject - :param metrics_list: pandas DataFrame containing the desired metrics (metrics functions are in metricsFunctions.py) - :type metrics_list: pd.DataFrame - :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument - :type by: str, optional + :param setname: name of the annotation set to extract conversations from + :type setname: str + :param features_list: pandas DataFrame containing the desired features (features functions are in conversationFunctions.py) + :type features_list: pd.DataFrame :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None :type recordings: Union[str, List[str], pd.DataFrame], optional :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None :type from_time: str, optional :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None :type to_time: str, optional - :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified.
Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value + :param rec_cols: comma separated columns from recordings.csv to include in the outputted extraction (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value :type rec_cols: str, optional - :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default + :param child_cols: comma separated columns from children.csv to include in the outputted extraction (optional), None by default :type child_cols: str, optional :param threads: amount of threads to run on, defaults to 1 :type threads: int, optional @@ -48,7 +55,7 @@ def __init__( self, project: ChildProject.projects.ChildProject, setname: str, - metrics_list: pd.DataFrame, + features_list: pd.DataFrame, recordings: Union[str, List[str], pd.DataFrame] = None, from_time: str = None, to_time: str = None, @@ -63,7 +70,7 @@ def __init__( self.conversations = None # check that the callable column is either a callable function or a string that can be found as being part of - # the list of metrics in ChildProject/pipelines/conversationFunctions.py + # the list of features in ChildProject/pipelines/conversationFunctions.py def check_callable(row): if callable(row["callable"]): return row["callable"] if isinstance(row["callable"], str): @@ -76,16 +83,16 @@ def check_callable(row): return f else: raise ValueError( - "{} cannot be evaluated as a metric, must be a callable object or a string".format(row["callable"])) + "{} cannot be evaluated as a feature, must be a callable object or a string".format(row["callable"])) # block checking presence of required columns and evaluates the callable functions - if isinstance(metrics_list, pd.DataFrame): - if ({'callable', 'name'}).issubset(metrics_list.columns): - metrics_list["callable"] = metrics_list.apply(check_callable, axis=1) + if isinstance(features_list, pd.DataFrame): + if ({'callable', 'name'}).issubset(features_list.columns): + features_list["callable"] = features_list.apply(check_callable, axis=1) else: - raise ValueError("metrics_list parameter must contain at least the columns [callable,name]") + raise ValueError("features_list parameter must contain at least the columns [callable,name]") else: - raise ValueError("metrics_list parameter must be a pandas DataFrame") + raise ValueError("features_list parameter must be a pandas DataFrame") if setname not in self.am.annotations["set"].values: raise ValueError( @@ -93,9 +100,9 @@ def check_callable(row): "check spelling and make sure the set was properly imported." ) self.set = setname - self.metrics_list = metrics_list + self.features_list = features_list - # necessary columns to construct the metrics + # necessary columns to construct the conversations join_columns = { "recording_filename", "child_id", @@ -111,8 +118,9 @@ def check_callable(row): rec_cols = set(rec_cols.split(",")) for i in rec_cols: if i not in correct_cols: - print( - "Warning, requested column <{}> does not exist in recordings.csv, ignoring this column. existing columns are : {}".format( + logger_conversations.warning( + "requested column <{}> does not exist in recordings.csv,\ + ignoring this column. 
existing columns are : {}".format( i, correct_cols)) rec_cols &= correct_cols # add wanted columns to the one we already get @@ -136,9 +144,9 @@ def check_callable(row): child_cols.add("child_id") for i in child_cols: if i not in correct_cols: - print( - "Warning, requested column <{}> does not exist in children.csv, ignoring this column. existing columns are : {}".format( - i, correct_cols)) + logger_conversations.warning( + "requested column <{}> does not exist in children.csv, ignoring this column. existing\ + columns are : {}".format(i, correct_cols)) child_cols &= correct_cols self.child_cols = child_cols @@ -160,7 +168,7 @@ def check_callable(row): if from_time: try: self.from_time = datetime.datetime.strptime(from_time, "%H:%M:%S") - except: + except ValueError: raise ValueError( f"invalid value for from_time ('{from_time}'); should have HH:MM:SS format instead") else: @@ -169,7 +177,7 @@ def check_callable(row): if to_time: try: self.to_time = datetime.datetime.strptime(to_time, "%H:%M:%S") - except: + except ValueError: raise ValueError(f"invalid value for to_time ('{to_time}'); should have HH:MM:SS format instead") else: self.to_time = None @@ -179,17 +187,18 @@ def __init_subclass__(cls, **kwargs): pipelines[cls.SUBCOMMAND] = cls def _process_conversation(self, conversation): #process recording line - #keep lines for which conv_count is nopt Na and group by conv - """for one unit (i.e. 1 recording) compute the list of required metrics and store the results in the current row of self.metrics + #keep lines for which conv_count is not Na and group by conv + """for one unit (i.e. 1 recording) compute the list of required features and store the results in the current + row of self.conversations :param row: index and Series of the unit to process, to be modified with the results :type row: (int , pandas.Series) - :return: Series containing all the computed metrics result for that unit + :return: Series containing all the computed features result for that unit :rtype: pandas.Series """ meta, annotations = conversation result = {'recording_filename': meta[0], 'conv_count': meta[1]} - for i, line in self.metrics_list.iterrows(): + for i, line in self.features_list.iterrows(): annotations['voc_duration'] = annotations['segment_offset'] - annotations['segment_onset'] @@ -198,10 +207,11 @@ def _process_conversation(self, conversation): #process recording line return result def extract(self): - """from the initiated self.metrics, compute each row metrics (handles threading) - Once the Metrics class is initialized, call this function to extract the metrics and populate self.metrics + """from the initiated self.features_list, compute each row feature (handles threading) + Once the Conversation class is initialized, call this function to extract the features and populate + self.conversations - :return: DataFrame of computed metrics + :return: DataFrame of computed features :rtype: pandas.DataFrame """ if self.threads == 1: @@ -212,19 +222,20 @@ def extract(self): ) as pool: full_annotations = pd.concat(pool.map(self.retrieve_segments, self.recordings)) - conversations = full_annotations.groupby(['recording_filename', 'conv_count']) + grouper = ['recording_filename', 'conv_count'] + conversations = full_annotations.groupby(grouper) if self.threads == 1: self.conversations = pd.DataFrame( [self._process_conversation(block) for block in conversations] - ) + ) if len(conversations) else pd.DataFrame(columns=grouper) else: with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() 
) as pool: self.conversations = pd.DataFrame( pool.map(self._process_conversation, [block for block in conversations]) - ) + ) if len(conversations) else pd.DataFrame(columns=grouper) # now add the rec_cols and child_cols in the result if self.rec_cols: @@ -254,22 +265,22 @@ def extract(self): [col for col in self.project.children.columns if (col not in self.child_cols and col != 'child_id')] )) - meta = chis.merge(self.project.recordings[['recording_filename','child_id']], how='inner', on='child_id') + meta = chis.merge(self.project.recordings[['recording_filename', 'child_id']], how='inner', on='child_id') self.conversations = self.conversations.merge(meta, how='left', on='recording_filename') if 'child_id' not in self.child_cols: self.conversations.drop(columns=['child_id']) + if not self.conversations.shape[0]: + logger_conversations.warning("The extraction did not find any conversation") return self.conversations def retrieve_segments(self, recording: str): """from a list of sets and a row identifying the unit computed, return the relevant annotation segments - :param sets: List of annotation sets to keep - :type sets: List[str] - :param row: Series storing the unit to compute information - :type row: pandas.Series - :return: relevant annotation DataFrame and index DataFrame - :rtype: (pandas.DataFrame , pandas.DataFrame) + :param recording: recording + :type recording: str + :return: relevant annotation segments + :rtype: pandas.DataFrame """ annotations = self.am.annotations[recording == self.am.annotations['recording_filename']] annotations = annotations[annotations["set"] == self.set] @@ -285,7 +296,8 @@ def retrieve_segments(self, recording: str): segments = segments.dropna(subset='conv_count') else: # no annotations for that unit - return pd.DataFrame(), pd.DataFrame() + return pd.DataFrame(columns=([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] + + list(annotations.columns) + ['conv_count'])) segments['recording_filename'] = recording @@ -295,30 +307,29 @@ def retrieve_segments(self, recording: str): class CustomConversations(Conversations): - """metrics extraction from a csv file. - Extracts a number of metrics listed in a csv file as a dataframe. + """conversations extraction from a csv file. + Extracts a number of features listed in a csv file as a dataframe. the csv file must contain the columns : - - 'callable' which is the name of the wanted metric from the list of available metrics - - 'set' which is the set of annotations to use for that specific metric (make sure this set has the required columns for that metric) - - 'name' is optional, this is the name to give to that metric (if not given, a default name will be attributed) - - any other necessary argument for the given metrics (eg the voc_speaker_ph metric requires the 'speaker' argument: add a column 'speaker' in the csv file and fill its cells for this metric with the wanted value (CHI|FEM|MAL|OCH)) + - 'callable' which is the name of the wanted feature from the list of available features + - 'name' is the name to give to that feature + - any other necessary argument for the given feature (eg the is_speaker feature requires the 'speaker' argument: add a column 'speaker' in the csv file and fill its cells for this feature with the wanted value (CHI|FEM|MAL|OCH)) :param project: ChildProject instance of the target dataset. 
:type project: ChildProject.projects.ChildProject - :param metrics: name of the csv file listing the metrics to extract - :type metrics: str + :param setname: name of the set to extract conversations from + :type setname: str + :param features: name of the csv file listing the features to extract + :type features: str :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None :type recordings: Union[str, List[str], pd.DataFrame], optional :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None :type from_time: str, optional :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None :type to_time: str, optional - :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value + :param rec_cols: comma separated columns from recordings.csv to include in the outputted conversations (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value :type rec_cols: str, optional - :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default + :param child_cols: comma separated columns from children.csv to include in the outputted conversations (optional), None by default :type child_cols: str, optional - :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument - :type by: str, optional :param threads: amount of threads to run on, defaults to 1 :type threads: int, optional """ @@ -328,29 +339,32 @@ class CustomConversations(Conversations): def __init__( self, project: ChildProject.projects.ChildProject, - metrics: str, + setname: str, + features: str, recordings: Union[str, List[str], pd.DataFrame] = None, from_time: str = None, to_time: str = None, rec_cols: str = None, child_cols: str = None, - by: str = "recording_filename", threads: int = 1, ): - metrics_df = pd.read_csv(metrics) + features_df = pd.read_csv(features) - super().__init__(project, metrics_df, by=by, recordings=recordings, + super().__init__(project, setname, features_df, recordings=recordings, from_time=from_time, to_time=to_time, rec_cols=rec_cols, child_cols=child_cols, threads=threads) @staticmethod def add_parser(subparsers, subcommand): - pass + parser = subparsers.add_parser(subcommand, help="custom conversation extraction") + parser.add_argument("features", + help="name of the csv file containing the list of features to extract", + ) class StandardConversations(Conversations): - """ACLEW metrics extractor. - Extracts a number of metrics from the ACLEW pipeline annotations, which includes: + """ACLEW conversations extractor. + Extracts a number of conversations from the ACLEW pipeline annotations, which includes: - The Voice Type Classifier by Lavechin et al. (arXiv:2005.12656) - The Automatic LInguistic Unit Count Estimator (ALICE) by Räsänen et al. 
(doi:10.3758/s13428-020-01460-x) @@ -370,12 +384,10 @@ class StandardConversations(Conversations): :type from_time: str, optional :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None :type to_time: str, optional - :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value + :param rec_cols: comma separated columns from recordings.csv to include in the outputted conversations (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a value :type rec_cols: str, optional - :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default + :param child_cols: comma separated columns from children.csv to include in the outputted conversations (optional), None by default :type child_cols: str, optional - :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument - :type by: str, optional :param threads: amount of threads to run on, defaults to 1 :type threads: int, optional """ @@ -394,7 +406,7 @@ def __init__( threads: int = 1, ): - METRICS = np.array( + features = np.array( [["conversation_onset", "conversation_onset", pd.NA], ["conversation_offset", "conversation_offset", pd.NA], ["conversation_duration", "conversation_duration", pd.NA], @@ -423,9 +435,9 @@ def __init__( ["assign_conv_type", "conversation_type", pd.NA], ]) - METRICS = pd.DataFrame(METRICS, columns=["callable", "name", "speaker"]) + features = pd.DataFrame(features, columns=["callable", "name", "speaker"]) - super().__init__(project, setname, METRICS, recordings=recordings, + super().__init__(project, setname, features, recordings=recordings, from_time=from_time, to_time=to_time, rec_cols=rec_cols, child_cols=child_cols, threads=threads) @@ -437,7 +449,10 @@ def add_parser(subparsers, subcommand): class ConversationsPipeline(Pipeline): def __init__(self): - self.metrics = [] + self.destination = None + self.project = None + self.conversations = None + self.parameters_path = None def run(self, path, destination, pipeline, func=None, **kwargs): self.destination = destination @@ -458,7 +473,7 @@ def run(self, path, destination, pipeline, func=None, **kwargs): datarepo = Repo(path) parameters['dataset_hash'] = datarepo.head.object.hexsha except InvalidGitRepositoryError: - print("Your dataset is not currently a git repository") + logger_conversations.warning("Your dataset is not currently a git repository") if pipeline not in pipelines: raise NotImplementedError(f"invalid pipeline '{pipeline}'") @@ -469,16 +484,16 @@ def run(self, path, destination, pipeline, func=None, **kwargs): self.conversations = conversations.conversations self.conversations.to_csv(self.destination, index=False) - # get the df of metrics used from the Metrics class - metrics_df = conversations.metrics_list - metrics_df['callable'] = metrics_df.apply(lambda row: row['callable'].__name__, + # get the df of features used from the 
Conversations class + features_df = conversations.features_list + features_df['callable'] = features_df.apply(lambda row: row['callable'].__name__, axis=1) # from the callables used, find their name back - parameters['metrics_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in - metrics_df.to_dict(orient='records')] + parameters['features_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in + features_df.to_dict(orient='records')] date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") # create a yaml file with all the parameters used self.parameters_path = os.path.splitext(self.destination)[0] + "_parameters_{}.yml".format(date) - print("exported metrics to {}".format(self.destination)) + logger_conversations.info("exported conversations to {}".format(self.destination)) yaml.dump( { "package_version": ChildProject.__version__, @@ -487,7 +502,7 @@ def run(self, path, destination, pipeline, func=None, **kwargs): }, open(self.parameters_path, "w+"), sort_keys=False, ) - print("exported sampler parameters to {}".format(self.parameters_path)) + logger_conversations.info("exported sampler parameters to {}".format(self.parameters_path)) return self.conversations @@ -509,7 +524,8 @@ def setup_parser(parser): parser.add_argument( "--recordings", - help="path to a CSV dataframe containing the list of recordings to sample from (by default, all recordings will be sampled). The CSV should have one column named recording_filename.", + help=("path to a CSV dataframe containing the list of recordings to sample from (by default, all recordings" + " will be sampled). The CSV should have one column named recording_filename."), default=None, ) @@ -529,13 +545,15 @@ def setup_parser(parser): parser.add_argument( "--rec-cols", - help="comma separated columns from recordings.csv to include in the outputted metrics (optional), NA if ambiguous", + help=("comma separated columns from recordings.csv to include in the outputted conversations (optional)," + " NA if ambiguous"), default=None, ) parser.add_argument( "--child-cols", - help="comma separated columns from children.csv to include in the outputted metrics (optional), NA if ambiguous", + help=("comma separated columns from children.csv to include in the outputted conversations (optional)," + " NA if ambiguous"), default=None, ) @@ -546,7 +564,10 @@ def setup_parser(parser): class ConversationsSpecificationPipeline(Pipeline): def __init__(self): - self.metrics = [] + self.destination = None + self.project = None + self.conversations = None + self.parameters_path = None def run(self, parameters_input, func=None): # build a dictionary with all parameters used @@ -554,7 +575,8 @@ def run(self, parameters_input, func=None): with open(parameters_input, "r") as stream: try: parameters = yaml.safe_load(stream) - if 'parameters' in parameters: parameters = parameters['parameters'] + if 'parameters' in parameters: + parameters = parameters['parameters'] except yaml.YAMLError as exc: raise yaml.YAMLError( "parsing of the parameters file {} failed. 
See above exception for more details".format( @@ -563,21 +585,21 @@ def run(self, parameters_input, func=None): if parameters: if "path" not in parameters: raise ValueError( - "the parameter file {} must contain at least the 'path' key specifying the path to the dataset".format( - parameters_input)) + ("the parameter file {} must contain at least the 'path' key specifying the path to the " + "dataset".format(parameters_input))) if "destination" not in parameters: raise ValueError( - "the parameter file {} must contain the 'destination' key specifying the file to output the metrics to".format( - parameters_input)) - if "metrics_list" not in parameters: + ("the parameter file {} must contain the 'destination' key specifying the file to output " + "the conversations to".format(parameters_input))) + if "features_list" not in parameters: raise ValueError( - "the parameter file {} must contain the 'metrics_list' key containing the list of the desired metrics".format( - parameters_input)) + ("the parameter file {} must contain the 'features_list' key containing the list of the desired " + "features".format(parameters_input))) try: - metrics_df = pd.DataFrame(parameters["metrics_list"]) + features_df = pd.DataFrame(parameters["features_list"]) except Exception as e: raise ValueError( - "The 'metrics_list' key in {} must be a list of elements".format(parameters_input)) from e + "The 'features_list' key in {} must be a list of elements".format(parameters_input)) from e else: raise ValueError("could not find any parameters in {}".format(parameters_input)) @@ -585,24 +607,25 @@ def run(self, parameters_input, func=None): datarepo = Repo(parameters["path"]) parameters['dataset_hash'] = datarepo.head.object.hexsha except InvalidGitRepositoryError: - print("Your dataset is not currently a git repository") + logger_conversations.warning("Your dataset is not currently a git repository") self.project = ChildProject.projects.ChildProject(parameters["path"]) self.project.read() self.destination = parameters['destination'] - unwanted_keys = {'metrics', 'pipeline'} + unwanted_keys = {'features', 'pipeline'} for i in unwanted_keys: - if i in parameters: del parameters[i] + if i in parameters: + del parameters[i] arguments = { key: parameters[key] for key in parameters - if key not in {"metrics_list", "path", "destination", "dataset_hash"} + if key not in {"features_list", "path", "destination", "dataset_hash"} } try: - conversations = Conversations(self.project, metrics_df, **arguments) + conversations = Conversations(self.project, features_list=features_df, **arguments) except TypeError as e: raise ValueError('Unrecognized parameter found {}'.format(e.args[0][46:])) from e conversations.extract() @@ -610,16 +633,16 @@ def run(self, parameters_input, func=None): self.conversations = conversations.conversations self.conversations.to_csv(self.destination, index=False) - # get the df of metrics used from the Metrics class - metrics_df = conversations.metrics_list - metrics_df['callable'] = metrics_df.apply(lambda row: row['callable'].__name__, - axis=1) # from the callables used, find their name back - parameters['metrics_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in - metrics_df.to_dict(orient='records')] + # get the df of features used from the Conversations class + features_df = conversations.features_list + features_df['callable'] = features_df.apply(lambda row: row['callable'].__name__, + axis=1) # from the callables used, find their name back + parameters['features_list'] = [{k: v for k, v 
in m.items() if pd.notnull(v)} for m in + features_df.to_dict(orient='records')] date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") # create a yaml file with all the parameters used self.parameters_path = os.path.splitext(self.destination)[0] + "_parameters_{}.yml".format(date) - print("exported metrics to {}".format(self.destination)) + logger_conversations.info("exported conversations to {}".format(self.destination)) yaml.dump( { "package_version": ChildProject.__version__, @@ -628,7 +651,7 @@ def run(self, parameters_input, func=None): }, open(self.parameters_path, "w+"), sort_keys=False, ) - print("exported metrics parameters to {}".format(self.parameters_path)) + logger_conversations.info("exported conversations parameters to {}".format(self.parameters_path)) return self.conversations From f7c209d669020d1157d3b4349a6012da633850de Mon Sep 17 00:00:00 2001 From: Loann Peurey <100950340+LoannPeurey@users.noreply.github.com> Date: Wed, 5 Jun 2024 18:36:33 +0200 Subject: [PATCH 24/44] Update conversations.py --- ChildProject/pipelines/conversations.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index 18729053..a9b75d6a 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -293,6 +293,10 @@ def retrieve_segments(self, recording: str): if matches.shape[0]: segments = self.am.get_segments(matches) + if not segments.shape[0]: + # no annotations for that unit + return pd.DataFrame(columns=([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] + + list(annotations.columns) + ['conv_count'])) segments = segments.dropna(subset='conv_count') else: # no annotations for that unit @@ -657,4 +661,4 @@ def run(self, parameters_input, func=None): @staticmethod def setup_parser(parser): - parser.add_argument("parameters_input", help="path to the yml file with all parameters") \ No newline at end of file + parser.add_argument("parameters_input", help="path to the yml file with all parameters") From a45803eef635078bbe21485bbc205fd242a0c075 Mon Sep 17 00:00:00 2001 From: Loann Peurey <100950340+LoannPeurey@users.noreply.github.com> Date: Wed, 5 Jun 2024 18:45:18 +0200 Subject: [PATCH 25/44] Update conversations.py --- ChildProject/pipelines/conversations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index a9b75d6a..586b35bc 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -307,7 +307,7 @@ def retrieve_segments(self, recording: str): #TODO check that required columns exist - return segments + return segments.reset_index(drop=True) class CustomConversations(Conversations): From 7c22e1c3db44c79de906fe1958109dc93011cb83 Mon Sep 17 00:00:00 2001 From: Loann Peurey <100950340+LoannPeurey@users.noreply.github.com> Date: Wed, 5 Jun 2024 19:14:35 +0200 Subject: [PATCH 26/44] ensure required columns are present --- ChildProject/pipelines/derivations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChildProject/pipelines/derivations.py b/ChildProject/pipelines/derivations.py index 4a05c8c5..0d9c0c25 100644 --- a/ChildProject/pipelines/derivations.py +++ b/ChildProject/pipelines/derivations.py @@ -175,7 +175,7 @@ def conversations(project, return df else: - return pd.DataFrame([]) + return pd.DataFrame([], columns=['segment_onset', 'raw_filename', 'segment_offset']) def 
remove_overlaps(project, From 317166b34e2f028a0f654bb083381c4c2b235d9d Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Thu, 6 Jun 2024 16:22:49 +0200 Subject: [PATCH 27/44] prevent duplicate columns when returning empty dataframe --- ChildProject/pipelines/conversations.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index 586b35bc..ff555cb5 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -295,13 +295,13 @@ def retrieve_segments(self, recording: str): segments = self.am.get_segments(matches) if not segments.shape[0]: # no annotations for that unit - return pd.DataFrame(columns=([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] - + list(annotations.columns) + ['conv_count'])) + return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] + + list(annotations.columns) + ['conv_count']))) segments = segments.dropna(subset='conv_count') else: # no annotations for that unit - return pd.DataFrame(columns=([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] - + list(annotations.columns) + ['conv_count'])) + return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] + + list(annotations.columns) + ['conv_count']))) segments['recording_filename'] = recording From 056cf604b5152b41cb6bb01bcb22d6717f30dbf4 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Thu, 27 Jun 2024 18:08:01 +0200 Subject: [PATCH 28/44] silence warning for inplace changes of a copy --- ChildProject/annotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ChildProject/annotations.py b/ChildProject/annotations.py index f1046539..35f816bf 100644 --- a/ChildProject/annotations.py +++ b/ChildProject/annotations.py @@ -1966,8 +1966,8 @@ def clip_segments(segments: pd.DataFrame, start: int, stop: int) -> pd.DataFrame start = int(start) stop = int(stop) - segments["segment_onset"].clip(lower=start, upper=stop, inplace=True) - segments["segment_offset"].clip(lower=start, upper=stop, inplace=True) + segments["segment_onset"] = segments["segment_onset"].clip(lower=start, upper=stop) + segments["segment_offset"] = segments["segment_offset"].clip(lower=start, upper=stop) segments = segments[segments["segment_offset"] > segments["segment_onset"]] From 61cc5a13a56370e48df0575da538f38675c7bef5 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Thu, 27 Jun 2024 18:09:55 +0200 Subject: [PATCH 29/44] exploratory on conversations extract optimizations --- ChildProject/pipelines/conversations.py | 84 ++++++++++++++++++------- 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index ff555cb5..a141defa 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod import os +import itertools import argparse import datetime import multiprocessing as mp @@ -20,6 +21,8 @@ import ChildProject.pipelines.conversationFunctions as convfunc from ..utils import TimeInterval +import time # RM + pipelines = {} # Create a logger for the module (file) @@ -89,6 +92,13 @@ def check_callable(row): if isinstance(features_list, pd.DataFrame): if ({'callable', 'name'}).issubset(features_list.columns): features_list["callable"] = features_list.apply(check_callable, axis=1) + try: + 
features_list = features_list.set_index('name', verify_integrity=True) + except ValueError as e: + raise ValueError("features_list parameter has duplicates in 'name' column") from e + features_list['args'] = features_list.drop(['callable'], axis=1).apply( + lambda row: row.dropna().to_dict(), axis=1) + features_list = features_list[['callable', 'args']] else: raise ValueError("features_list parameter must contain at least the columns [callable,name]") else: @@ -196,15 +206,37 @@ def _process_conversation(self, conversation): #process recording line :return: Series containing all the computed features result for that unit :rtype: pandas.Series """ - meta, annotations = conversation - result = {'recording_filename': meta[0], 'conv_count': meta[1]} - for i, line in self.features_list.iterrows(): + start_sub = time.time()# RM + # meta, segments = conversation + segments = conversation + meta = conversation.name + + result = {} + # move to init or at least something done once only + index = self.features_list.to_dict(orient="index") + # for i in index: + # + # result[i] = index[i]["callable"](segments, **index[i]['args']) + result['test'] = segments.reset_index().iloc[0]['segment_onset'] + + # result['recording_filename'] = meta[0] + result['conv_count'] = meta - annotations['voc_duration'] = annotations['segment_offset'] - annotations['segment_onset'] + return result#, time.time() - start_sub #RM last bit - result[line['name']] = line["callable"](annotations, **line.drop(['callable', 'name']).dropna().to_dict()) + def _process_recording(self, recording): + grouper = 'conv_count' + start_sub = time.time() # RM + segments = self.retrieve_segments(recording) + segments['voc_duration'] = segments['segment_offset'] - segments['segment_onset'] - return result + conversations = segments.groupby(grouper, group_keys=True) + + #keep as Series?? 
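+ # apply() runs _process_conversation once per conv_count group and returns a
+ # Series of result dicts indexed by conv_count; to_list() then drops that index,
+ # which is safe here because each result dict already carries its own conv_count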
+ extractions = conversations.apply(self._process_conversation).to_list() if len(conversations) else [] + # extractions = [self._process_conversation(block) for block in conversations] + + return extractions, time.time() - start_sub #RM last bit def extract(self): """from the initiated self.features_list, compute each row feature (handles threading) @@ -215,27 +247,32 @@ def extract(self): :rtype: pandas.DataFrame """ if self.threads == 1: - full_annotations = pd.concat([self.retrieve_segments(rec) for rec in self.recordings]) - else: - with mp.Pool( - processes=self.threads if self.threads >= 1 else mp.cpu_count() - ) as pool: - full_annotations = pd.concat(pool.map(self.retrieve_segments, self.recordings)) + extractions = [] + for rec in self.recordings: + segments = self.retrieve_segments(rec) - grouper = ['recording_filename', 'conv_count'] - conversations = full_annotations.groupby(grouper) + conversations = segments.groupby(grouper) - if self.threads == 1: - self.conversations = pd.DataFrame( - [self._process_conversation(block) for block in conversations] - ) if len(conversations) else pd.DataFrame(columns=grouper) + extractions += [self._process_conversation(block) for block in conversations] + self.conversations = pd.DataFrame(extractions) if len(extractions) else pd.DataFrame(columns=grouper) else: + import time with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() ) as pool: - self.conversations = pd.DataFrame( - pool.map(self._process_conversation, [block for block in conversations]) - ) if len(conversations) else pd.DataFrame(columns=grouper) + results = pool.map(self._process_recording, self.recordings) + + split = [] + times = [] + for i in results: + split += i[0] + times.append(i[1]) + + results = list(itertools.chain.from_iterable(split)) + times = np.array(times) + print("total_process_rec_time = {} s".format(times.sum())) + print("avg_process_rec_time = {} s".format(times.mean())) + self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=grouper) # now add the rec_cols and child_cols in the result if self.rec_cols: @@ -410,8 +447,8 @@ def __init__( threads: int = 1, ): - features = np.array( - [["conversation_onset", "conversation_onset", pd.NA], + features = np.array([ + ["conversation_onset", "conversation_onset", pd.NA], ["conversation_offset", "conversation_offset", pd.NA], ["conversation_duration", "conversation_duration", pd.NA], ["vocalisations_count", "vocalisations_count", pd.NA], @@ -419,7 +456,6 @@ def __init__( ["who_finished", "finisher", pd.NA], ["who_participates", "participants", pd.NA], ["total_duration_of_vocalisations", "total_duration_of_vocalisations", pd.NA], - ["conversation_duration", "conversation_duration", pd.NA], ["is_speaker", "CHI_present", 'CHI'], ["is_speaker", "FEM_present", 'FEM'], ["is_speaker", "MAL_present", 'MAL'], From 097dd1bfee49139c71c1c75eefaa1ed291f19c7f Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Fri, 28 Jun 2024 17:16:48 +0200 Subject: [PATCH 30/44] remove some dowstream analysis that can be computed later, standardize usage of segments vs annotations --- .../pipelines/conversationFunctions.py | 66 +++++++------------ 1 file changed, 23 insertions(+), 43 deletions(-) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index 5c6286fc..b3b44243 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -9,9 +9,9 @@ """ This file lists all the metrics 
functions commonly used. New metrics can be added by defining new functions for the Conversations class to use : - - Create a new function using the same arguments (i.e. annotations, duration, **kwargs) + - Create a new function using the same arguments (i.e. segments, duration, **kwargs) - Define calculation of the metric with: - - annotations, which is a dataframe containing all the relevant annotated segments to use. It contains the + - segments, which is a dataframe containing all the relevant annotated segments to use. It contains the annotation content (https://childproject.readthedocs.io/en/latest/format.html#id10) joined with the annotation index info (https://childproject.readthedocs.io/en/latest/format.html#id11) as well as any column that was requested to be added to the results by the user using --child-cols or --rec-cols (eg --child-cols child_dob, @@ -49,12 +49,12 @@ def decorator(function): function.__name__, a, RESERVED)) @functools.wraps(function) - def new_func(annotations: pd.DataFrame, **kwargs): + def new_func(segments: pd.DataFrame, **kwargs): for arg in args: if arg not in kwargs: raise ValueError(f"{function.__name__} metric needs an argument <{arg}>") - res = function(annotations, **kwargs) + res = function(segments, **kwargs) return res @@ -64,71 +64,51 @@ def new_func(annotations: pd.DataFrame, **kwargs): @conversationFunction() -def conversation_onset(annotations: pd.DataFrame): - return annotations.reset_index().iloc[0]['segment_onset'] +def who_initiated(segments: pd.DataFrame): + return segments.iloc[0]['speaker_type'] @conversationFunction() -def conversation_offset(annotations: pd.DataFrame): - return annotations.reset_index().iloc[-1]['segment_offset'] - - -@conversationFunction() -def conversation_duration(annotations: pd.DataFrame): - return annotations.reset_index().iloc[-1]['segment_offset'] - annotations.reset_index().iloc[0]['segment_onset'] - - -@conversationFunction() -def vocalisations_count(annotations: pd.DataFrame): - return annotations['speaker_type'].count() - - -@conversationFunction() -def who_initiated(annotations: pd.DataFrame): - return annotations.reset_index().iloc[0]['speaker_type'] - - -@conversationFunction() -def who_finished(annotations: pd.DataFrame): - return annotations.reset_index().iloc[-1]['speaker_type'] +def who_finished(segments: pd.DataFrame): + return segments[segments['segment_offset'] == segments['segment_offset'].max()]['speaker_type'] @conversationFunction() -def who_participates(annotations: pd.DataFrame): - return '/'.join(annotations.reset_index()['speaker_type'].unique()) +def who_participates(segments: pd.DataFrame): + return '/'.join(segments['speaker_type'].unique()) @conversationFunction() -def total_duration_of_vocalisations(annotations: pd.DataFrame): - return annotations['voc_duration'].sum() +def total_duration_of_vocalisations(segments: pd.DataFrame): + return segments['voc_duration'].sum() @conversationFunction({'speaker'}) -def is_speaker(annotations: pd.DataFrame, **kwargs): - return kwargs["speaker"] in annotations['speaker_type'].tolist() +def is_speaker(segments: pd.DataFrame, **kwargs): + return kwargs["speaker"] in segments['speaker_type'].tolist() @conversationFunction({'speaker'}) -def voc_counter(annotations: pd.DataFrame, **kwargs): - return annotations[annotations['speaker_type'] == kwargs["speaker"]]['speaker_type'].count() +def voc_counter(segments: pd.DataFrame, **kwargs): + return segments[segments['speaker_type'] == kwargs["speaker"]]['speaker_type'].count() 
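+# Hypothetical example (not part of this patch): a new feature is registered the
+# same way, declaring its required arguments in the decorator so that a missing
+# argument raises a clear ValueError at call time.
+@conversationFunction({'speaker'})
+def voc_average(segments: pd.DataFrame, **kwargs):
+    # mean vocalisation duration (ms) for the requested speaker
+    return segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].mean()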
@conversationFunction({'speaker'}) -def voc_total(annotations: pd.DataFrame, **kwargs): - return annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) +def voc_total(segments: pd.DataFrame, **kwargs): + return segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) @conversationFunction({'speaker'}) -def voc_contribution(annotations: pd.DataFrame, **kwargs): - speaker_total = annotations[annotations['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) - total = annotations['voc_duration'].sum() +def voc_contribution(segments: pd.DataFrame, **kwargs): + speaker_total = segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) + total = segments['voc_duration'].sum() return speaker_total / total @conversationFunction() -def assign_conv_type(annotations: pd.DataFrame): +def assign_conv_type(segments: pd.DataFrame): #pd.Categorical(['overheard', 'dyadic_FEM', 'dyadic_MAL', 'peer', 'parent', 'triadic_FEM', 'triadic_MAL', 'multiparty']) speaker_present = {} for speaker in ['CHI', 'FEM', 'MAL', 'OCH']: - speaker_present[speaker] = [speaker in annotations['speaker_type'].tolist()] + speaker_present[speaker] = [speaker in segments['speaker_type'].tolist()] speaker_df = pd.DataFrame.from_dict(speaker_present).iloc[0, :] if not speaker_df['CHI']: From f284f697409a9e649ecfeee72421f15d4d73587d Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Fri, 28 Jun 2024 17:17:03 +0200 Subject: [PATCH 31/44] further optimization --- ChildProject/pipelines/conversations.py | 86 +++++++++---------------- 1 file changed, 29 insertions(+), 57 deletions(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index a141defa..c89b33e7 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -110,7 +110,7 @@ def check_callable(row): "check spelling and make sure the set was properly imported." ) self.set = setname - self.features_list = features_list + self.features_dict = features_list.to_dict(orient="index") # necessary columns to construct the conversations join_columns = { @@ -196,8 +196,7 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) pipelines[cls.SUBCOMMAND] = cls - def _process_conversation(self, conversation): #process recording line - #keep lines for which conv_count is not Na and group by conv + def _process_conversation(self, conversation, rec): #process recording line """for one unit (i.e. 
1 recording) compute the list of required features and store the results in the current row of self.conversations @@ -206,40 +205,37 @@ def _process_conversation(self, conversation): #process recording line :return: Series containing all the computed features result for that unit :rtype: pandas.Series """ - start_sub = time.time()# RM - # meta, segments = conversation segments = conversation - meta = conversation.name - result = {} - # move to init or at least something done once only - index = self.features_list.to_dict(orient="index") - # for i in index: - # - # result[i] = index[i]["callable"](segments, **index[i]['args']) - result['test'] = segments.reset_index().iloc[0]['segment_onset'] + # results that are included regardless of the required list + result = {'conversation_onset': segments.iloc[0]['segment_onset'], + 'conversation_offset': segments['segment_offset'].max(), + 'voc_count': segments['speaker_type'].count(), + 'conv_count': conversation.name, + 'recording_filename': rec + } + # apply the functions required + for i in self.features_dict: + result[i] = self.features_dict[i]["callable"](segments, **self.features_dict[i]['args']) - # result['recording_filename'] = meta[0] - result['conv_count'] = meta - - return result#, time.time() - start_sub #RM last bit + return result def _process_recording(self, recording): grouper = 'conv_count' - start_sub = time.time() # RM segments = self.retrieve_segments(recording) segments['voc_duration'] = segments['segment_offset'] - segments['segment_onset'] conversations = segments.groupby(grouper, group_keys=True) - #keep as Series?? - extractions = conversations.apply(self._process_conversation).to_list() if len(conversations) else [] + # keep as Series?? + extractions = conversations.apply( + self._process_conversation, rec=recording).to_list() if len(conversations) else [] # extractions = [self._process_conversation(block) for block in conversations] - return extractions, time.time() - start_sub #RM last bit + return extractions def extract(self): - """from the initiated self.features_list, compute each row feature (handles threading) + """from the initiated self.features_dict, compute each row feature (handles threading) Once the Conversation class is initialized, call this function to extract the features and populate self.conversations @@ -256,24 +252,18 @@ def extract(self): extractions += [self._process_conversation(block) for block in conversations] self.conversations = pd.DataFrame(extractions) if len(extractions) else pd.DataFrame(columns=grouper) else: - import time with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() ) as pool: - results = pool.map(self._process_recording, self.recordings) - - split = [] - times = [] - for i in results: - split += i[0] - times.append(i[1]) - - results = list(itertools.chain.from_iterable(split)) - times = np.array(times) - print("total_process_rec_time = {} s".format(times.sum())) - print("avg_process_rec_time = {} s".format(times.mean())) + # TODO unify usage of np array (or unify not using them) + results = list(itertools.chain.from_iterable(pool.map(self._process_recording, self.recordings))) + # results = list(np.concatenate(np.array(pool.map(self._process_recording, self.recordings))).ravel()) + + # self.conversations = pd.DataFrame(split) if len(split) else pd.DataFrame(columns=grouper) self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=grouper) + # self.conversations = pd.concat(pool.map(self._process_recording, self.recordings)) 
+
         # now add the rec_cols and child_cols in the result
         if self.rec_cols:
             if self.child_cols:
@@ -340,10 +330,6 @@ def retrieve_segments(self, recording: str):
             return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required]
                                                  + list(annotations.columns) + ['conv_count'])))
 
-        segments['recording_filename'] = recording
-
-        #TODO check that required columns exist
-
         return segments.reset_index(drop=True)
 
 
@@ -448,31 +434,17 @@ def __init__(
     ):
 
         features = np.array([
-            ["conversation_onset", "conversation_onset", pd.NA],
-            ["conversation_offset", "conversation_offset", pd.NA],
-            ["conversation_duration", "conversation_duration", pd.NA],
-            ["vocalisations_count", "vocalisations_count", pd.NA],
             ["who_initiated", "initiator", pd.NA],
             ["who_finished", "finisher", pd.NA],
-            ["who_participates", "participants", pd.NA],
             ["total_duration_of_vocalisations", "total_duration_of_vocalisations", pd.NA],
-            ["is_speaker", "CHI_present", 'CHI'],
-            ["is_speaker", "FEM_present", 'FEM'],
-            ["is_speaker", "MAL_present", 'MAL'],
-            ["is_speaker", "OCH_present", 'OCH'],
             ["voc_counter", "CHI_voc_counter", 'CHI'],
             ["voc_counter", "FEM_voc_counter", 'FEM'],
             ["voc_counter", "MAL_voc_counter", 'MAL'],
             ["voc_counter", "OCH_voc_counter", 'OCH'],
-            ["voc_total", "CHI_voc_total", 'CHI'],
-            ["voc_total", "FEM_voc_total", 'FEM'],
-            ["voc_total", "MAL_voc_total", 'MAL'],
-            ["voc_total", "OCH_voc_total", 'OCH'],
-            ["voc_contribution", "CHI_voc_contribution", 'CHI'],
-            ["voc_contribution", "FEM_voc_contribution", 'FEM'],
-            ["voc_contribution", "MAL_voc_contribution", 'MAL'],
-            ["voc_contribution", "OCH_voc_contribution", 'OCH'],
-            ["assign_conv_type", "conversation_type", pd.NA],
+            ["voc_total", "CHI_voc_dur", 'CHI'],
+            ["voc_total", "FEM_voc_dur", 'FEM'],
+            ["voc_total", "MAL_voc_dur", 'MAL'],
+            ["voc_total", "OCH_voc_dur", 'OCH'],
         ])
 
         features = pd.DataFrame(features, columns=["callable", "name", "speaker"])

From 76c23cb7277506f78d1a27a5d842d385e4cfc782 Mon Sep 17 00:00:00 2001
From: Loann Peurey
Date: Tue, 2 Jul 2024 17:03:41 +0200
Subject: [PATCH 32/44] cleaning up and adding some docstrings

---
 .../pipelines/conversationFunctions.py        | 53 +++++++++++++++---
 ChildProject/pipelines/conversations.py       | 56 +++++++++++--------
 2 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py
index b3b44243..a64f8331 100644
--- a/ChildProject/pipelines/conversationFunctions.py
+++ b/ChildProject/pipelines/conversationFunctions.py
@@ -25,10 +25,7 @@
 !! Metrics functions should still behave and return the correct result when receiving an empty dataframe
 """
 RESERVED = {'name', 'callable'}  # arguments reserved usage. use other keyword labels.
-#TODO
-# 1. Start and end time of each conversation in the recording
-# 2. Duration of time between conversations (e.g., time between Convo 1 and Convo 2)
-# 5. A string with a list of speaker tags in the conversation (e.g., "CHI, FEM, OCH")
+
 def conversationFunction(args: set = set()):
     """Decorator for all metrics functions to make them ready to be called by the pipeline.
@@ -65,39 +62,76 @@ def new_func(segments: pd.DataFrame, **kwargs): @conversationFunction() def who_initiated(segments: pd.DataFrame): + """speaker type who spoke first in the conversation + + Required keyword arguments: + """ return segments.iloc[0]['speaker_type'] @conversationFunction() def who_finished(segments: pd.DataFrame): + """speaker type who spoke last in the conversation + + Required keyword arguments: + """ return segments[segments['segment_offset'] == segments['segment_offset'].max()]['speaker_type'] @conversationFunction() def who_participates(segments: pd.DataFrame): + """list of speakers participating in the conversation, '/' separated + + Required keyword arguments: + """ return '/'.join(segments['speaker_type'].unique()) @conversationFunction() -def total_duration_of_vocalisations(segments: pd.DataFrame): +def voc_total_dur(segments: pd.DataFrame): + """summed duration of all speech in the conversation (ms) N.B. can be higher than conversation duration as + speakers may speak at the same time, resulting in multiple spoken segments happening simultaneously + + Required keyword arguments: + """ return segments['voc_duration'].sum() @conversationFunction({'speaker'}) def is_speaker(segments: pd.DataFrame, **kwargs): + """is a specific speaker type present in the conversation + + Required keyword arguments: + - speaker : speaker_type to evaluate presence of + """ return kwargs["speaker"] in segments['speaker_type'].tolist() @conversationFunction({'speaker'}) -def voc_counter(segments: pd.DataFrame, **kwargs): +def voc_speaker_count(segments: pd.DataFrame, **kwargs): + """number of vocalizations produced by a given speaker + + Required keyword arguments: + - speaker : speaker_type to count the vocalizations of + """ return segments[segments['speaker_type'] == kwargs["speaker"]]['speaker_type'].count() @conversationFunction({'speaker'}) -def voc_total(segments: pd.DataFrame, **kwargs): +def voc_speaker_dur(segments: pd.DataFrame, **kwargs): + """summed duration of speech for a given speaker in the conversation + + Required keyword arguments: + - speaker + """ return segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) @conversationFunction({'speaker'}) def voc_contribution(segments: pd.DataFrame, **kwargs): + """contribution of a given speaker in the conversation compared to others, in terms of total speech duration + + Required keyword arguments: + - speaker + """ speaker_total = segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1) total = segments['voc_duration'].sum() return speaker_total / total @@ -105,6 +139,11 @@ def voc_contribution(segments: pd.DataFrame, **kwargs): @conversationFunction() def assign_conv_type(segments: pd.DataFrame): + """Compute the conversation type (overheard, dyadic_XXX, peer, parent, triadic_XXX, multiparty) depending on the + participants + + Required keyword arguments: + """ #pd.Categorical(['overheard', 'dyadic_FEM', 'dyadic_MAL', 'peer', 'parent', 'triadic_FEM', 'triadic_MAL', 'multiparty']) speaker_present = {} for speaker in ['CHI', 'FEM', 'MAL', 'OCH']: diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index c89b33e7..dccc88d7 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -197,13 +197,14 @@ def __init_subclass__(cls, **kwargs): pipelines[cls.SUBCOMMAND] = cls def _process_conversation(self, conversation, rec): #process recording line - """for one unit (i.e. 
1 recording) compute the list of required features and store the results in the current
-        row of self.conversations
-
-        :param row: index and Series of the unit to process, to be modified with the results
-        :type row: (int , pandas.Series)
-        :return: Series containing all the computed features result for that unit
-        :rtype: pandas.Series
+        """for one conversation block, compute the list of required features and return the results as a dictionary
+
+        :param conversation: annotation segments of the conversation block to process
+        :type conversation: pd.DataFrame
+        :param rec: recording_filename the conversation belongs to
+        :type rec: str
+        :return: dict containing all the computed features for that conversation
+        :rtype: dict
         """
         segments = conversation
@@ -212,7 +213,8 @@ def _process_conversation(self, conversation, rec): #process recording line
             'conversation_offset': segments['segment_offset'].max(),
             'voc_count': segments['speaker_type'].count(),
             'conv_count': conversation.name,
-            'recording_filename': rec
+            'interval_last_conv': conversation.iloc[0]['time_since_last_conv'],
+            'recording_filename': rec,
         }
         # apply the functions required
         for i in self.features_dict:
@@ -221,10 +223,25 @@ def _process_conversation(self, conversation, rec): #process recording line
         return result
 
     def _process_recording(self, recording):
+        """for one recording, retrieve the required segments, group them by conversation and launch the computation for each block
+
+        :param recording: recording_filename of the recording to process
+        :type recording: str
+        :return: list of dicts containing the computed features of each conversation
+        :rtype: list[dict]
+        """
         grouper = 'conv_count'
         segments = self.retrieve_segments(recording)
 
         segments['voc_duration'] = segments['segment_offset'] - segments['segment_onset']
+        # compute the duration between each conversation and the previous one
+        terminals = segments[segments['conv_count'].shift(-1) != segments['conv_count']]
+        terminals.index += 1
+        steps = (segments[segments['conv_count'].shift(1) != segments['conv_count']]['segment_onset'] -
+                 terminals['segment_offset']).dropna()
+        steps.index = segments.loc[steps.index, 'conv_count']
+        segments['time_since_last_conv'] = segments['conv_count'].map(steps)
+
         conversations = segments.groupby(grouper, group_keys=True)
 
         # keep as Series??
@@ -255,15 +272,10 @@ def extract(self): with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() ) as pool: - # TODO unify usage of np array (or unify not using them) results = list(itertools.chain.from_iterable(pool.map(self._process_recording, self.recordings))) - # results = list(np.concatenate(np.array(pool.map(self._process_recording, self.recordings))).ravel()) - # self.conversations = pd.DataFrame(split) if len(split) else pd.DataFrame(columns=grouper) self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=grouper) - # self.conversations = pd.concat(pool.map(self._process_recording, self.recordings)) - # now add the rec_cols and child_cols in the result if self.rec_cols: if self.child_cols: @@ -436,15 +448,15 @@ def __init__( features = np.array([ ["who_initiated", "initiator", pd.NA], ["who_finished", "finisher", pd.NA], - ["total_duration_of_vocalisations", "total_duration_of_vocalisations", pd.NA], - ["voc_counter", "CHI_voc_counter", 'CHI'], - ["voc_counter", "FEM_voc_counter", 'FEM'], - ["voc_counter", "MAL_voc_counter", 'MAL'], - ["voc_counter", "OCH_voc_counter", 'OCH'], - ["voc_total", "CHI_voc_dur", 'CHI'], - ["voc_total", "FEM_voc_dur", 'FEM'], - ["voc_total", "MAL_voc_dur", 'MAL'], - ["voc_total", "OCH_voc_dur", 'OCH'], + ["voc_total_dur", "total_duration_of_vocalisations", pd.NA], + ["voc_speaker_count", "CHI_voc_count", 'CHI'], + ["voc_speaker_count", "FEM_voc_count", 'FEM'], + ["voc_speaker_count", "MAL_voc_count", 'MAL'], + ["voc_speaker_count", "OCH_voc_count", 'OCH'], + ["voc_speaker_dur", "CHI_voc_dur", 'CHI'], + ["voc_speaker_dur", "FEM_voc_dur", 'FEM'], + ["voc_speaker_dur", "MAL_voc_dur", 'MAL'], + ["voc_speaker_dur", "OCH_voc_dur", 'OCH'], ]) features = pd.DataFrame(features, columns=["callable", "name", "speaker"]) From 84f62d45fe0d3957354a162bb23eea0970862248 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Thu, 4 Jul 2024 10:59:53 +0200 Subject: [PATCH 33/44] cleaning comment --- ChildProject/pipelines/conversationFunctions.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index a64f8331..96dbbd21 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -1,4 +1,3 @@ -# define functions to calculate metrics import pandas as pd import numpy as np import ast @@ -78,7 +77,7 @@ def who_finished(segments: pd.DataFrame): return segments[segments['segment_offset'] == segments['segment_offset'].max()]['speaker_type'] @conversationFunction() -def who_participates(segments: pd.DataFrame): +def participants(segments: pd.DataFrame): """list of speakers participating in the conversation, '/' separated Required keyword arguments: @@ -100,7 +99,7 @@ def is_speaker(segments: pd.DataFrame, **kwargs): """is a specific speaker type present in the conversation Required keyword arguments: - - speaker : speaker_type to evaluate presence of + - speaker : speaker_type label """ return kwargs["speaker"] in segments['speaker_type'].tolist() @@ -110,7 +109,7 @@ def voc_speaker_count(segments: pd.DataFrame, **kwargs): """number of vocalizations produced by a given speaker Required keyword arguments: - - speaker : speaker_type to count the vocalizations of + - speaker : speaker_type label """ return segments[segments['speaker_type'] == kwargs["speaker"]]['speaker_type'].count() @@ -120,17 +119,17 @@ def voc_speaker_dur(segments: pd.DataFrame, 
**kwargs):
     """summed duration of speech for a given speaker in the conversation
 
     Required keyword arguments:
-        - speaker
+        - speaker : speaker_type label
     """
     return segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1)
 
 
 @conversationFunction({'speaker'})
-def voc_contribution(segments: pd.DataFrame, **kwargs):
+def voc_dur_contribution(segments: pd.DataFrame, **kwargs):
     """contribution of a given speaker in the conversation compared to others, in terms of total speech duration
 
     Required keyword arguments:
-        - speaker
+        - speaker : speaker_type label
     """
     speaker_total = segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].sum(min_count=1)
     total = segments['voc_duration'].sum()
     return speaker_total / total

From a6775e9e76ed017cf18608af9faf9a4aef58718b Mon Sep 17 00:00:00 2001
From: Loann Peurey
Date: Thu, 4 Jul 2024 11:00:45 +0200
Subject: [PATCH 34/44] add doc for conversation extraction

---
 docs/source/_ext/directives.py |  16 +++++-
 docs/source/annotations.rst    |   2 +
 docs/source/conversations.rst  | 101 +++++++++++++++++++++++++++++++++
 docs/source/index.rst          |   1 +
 4 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/conversations.rst

diff --git a/docs/source/_ext/directives.py b/docs/source/_ext/directives.py
index 2e7143ae..0ae9a410 100644
--- a/docs/source/_ext/directives.py
+++ b/docs/source/_ext/directives.py
@@ -10,7 +10,7 @@
 
 from ChildProject.projects import ChildProject
 from ChildProject.annotations import AnnotationManager
-from ChildProject.pipelines import metricsFunctions
+from ChildProject.pipelines import metricsFunctions, conversationFunctions
 
 import subprocess
 
@@ -119,6 +119,20 @@ def __init__(self, *args, **kwargs):
                         'Required arguments': wrap(arguments,25),
                     }
                     df.append(df_entry)
+            elif array == 'list-conversation-metrics':
+                ignores = {'conversationFunction'}
+                metrics = getmembers(conversationFunctions, isfunction)
+                for name, func in metrics:
+                    if name in ignores : continue
+                    doc = func.__doc__.split('Required keyword arguments:',1)
+                    description = cleandoc(doc[0])
+                    arguments = cleandoc(doc[1]) if len(doc) > 1 else ""
+                    df_entry = {
+                        'Callable': name,
+                        'Description': wrap(description, 45),
+                        'Required arguments': wrap(arguments,35),
+                    }
+                    df.append(df_entry)
 
         self.options['file'] = '{}.csv'.format(array)
         self.options['header-rows'] = 1
diff --git a/docs/source/annotations.rst b/docs/source/annotations.rst
index 25d5a693..cb4a2c38 100644
--- a/docs/source/annotations.rst
+++ b/docs/source/annotations.rst
@@ -93,6 +93,8 @@ remove them from the index.
 
     child-project remove-annotations /path/to/dataset --set vtc
 
+.. _derive-annotations:
+
 Derive annotations
 ~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/conversations.rst b/docs/source/conversations.rst
new file mode 100644
index 00000000..a8cedefd
--- /dev/null
+++ b/docs/source/conversations.rst
@@ -0,0 +1,101 @@
+Conversations summary extraction
+--------------------------------
+
+Overview
+~~~~~~~~
+
+This package can extract descriptive statistics on conversations identified in recordings. The set used for the extraction must contain conversation annotations, that is, annotations with the columns ``segment_onset``, ``segment_offset``, ``speaker_type`` and ``conv_count``.
+The :ref:`derive-annotations` pipeline can be used to derive conversation annotations from diarized annotations; we recommend running it on automated VTC annotations to obtain conversation annotations automatically.
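+
+For instance, a standard extraction could be launched as below (``/path/to/dataset`` and ``output.csv`` are placeholders; the remaining options, such as the annotation set to extract from, are listed by the ``--help`` commands shown further down):
+
+.. code-block:: bash
+
+   child-project conversations-summary /path/to/dataset output.csv standard
+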
+A csv file containing the statistics is produced, along with a YML parameter file storing all the options used for the extraction.
+
+.. clidoc::
+
+   child-project conversations-summary --help
+
+The conversation extraction output will always contain the following columns:
+
+.. csv-table::
+    :header: "column", "info"
+    :widths: 19, 30
+    :stub-columns: 1
+
+    conversation_onset, start of the conversation (ms) inside the recording
+    conversation_offset, end of the conversation (ms) inside the recording
+    voc_count, number of vocalizations inside the conversation
+    conv_count, identifier of the conversation (unique across the recording)
+    interval_last_conv, interval (ms) between the end of the previous conversation and the start of the current one (NA for the first conversation of a recording)
+    recording_filename, recording of the conversation
+
+The list of supported functions is shown below:
+
+.. _list-conversation-metrics:
+
+.. custom-table::
+    :header: list-conversation-metrics
+
+Standard Conversations
+~~~~~~~~~~~~~~~~~~~~~~
+
+The Standard pipeline extracts a list of commonly used metrics from conversations. Using this pipeline on a set containing conversation annotations will output:
+
+.. csv-table::
+    :header: "metric", "name", "speaker"
+    :widths: 19, 19, 19
+    :stub-columns: 1
+
+    "who_initiated", "initiator",
+    "who_finished", "finisher",
+    "voc_total_dur", "total_duration_of_vocalisations",
+    "voc_speaker_count", "CHI_voc_count", 'CHI'
+    "voc_speaker_count", "FEM_voc_count", 'FEM'
+    "voc_speaker_count", "MAL_voc_count", 'MAL'
+    "voc_speaker_count", "OCH_voc_count", 'OCH'
+    "voc_speaker_dur", "CHI_voc_dur", 'CHI'
+    "voc_speaker_dur", "FEM_voc_dur", 'FEM'
+    "voc_speaker_dur", "MAL_voc_dur", 'MAL'
+    "voc_speaker_dur", "OCH_voc_dur", 'OCH'
+
+.. clidoc::
+
+   child-project conversations-summary /path/to/dataset output.csv standard --help
+
+Custom Conversations
+~~~~~~~~~~~~~~~~~~~~
+
+.. _list-structure:
+
+The Custom conversations pipeline allows you to provide your own list of desired metrics for the pipeline to extract.
+The list must be in a csv file containing the following columns:
+
+- callable (required) : name of the metric to extract, see :ref:`the list <list-conversation-metrics>`
+- name (required) : name to use in the resulting output. Use this to extract the same metric with different arguments and avoid name clashes.
+- (depending on the requirements of the metric you chose) : for each required argument of a metric, add a column named after that argument.
+
+This is an example of a csv file we use to extract conversation metrics.
+We want to extract who initiated the conversation, who finished it, the list of speakers involved, the share of speech duration produced by the target child (CHI) in each conversation, and the same for female adult speakers (FEM).
+So we write 5 lines, one for each metric: we give the reference to the metric (as listed in the table above), the name we want in the final output, and, for some of them, the required argument(s).
+
+.. csv-table::
+    :header: "metric", "name", "speaker"
+    :widths: 20, 10, 20
+
+    who_initiated, "initiator",
+    who_finished, "finisher",
+    participants, "participants",
+    voc_dur_contribution, chi_dur_contrib, CHI
+    voc_dur_contribution, fem_dur_contrib, FEM
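+
+If none of the packaged functions computes the metric you need, a new function can be added to ``ChildProject/pipelines/conversationFunctions.py``, following the pattern of the existing ones. The sketch below is illustrative (``voc_mean_dur`` is not part of the package): the decorator declares the required keyword arguments, and the function receives the annotated segments of one conversation as a dataframe:
+
+.. code-block:: python
+
+    import pandas as pd
+
+    from ChildProject.pipelines.conversationFunctions import conversationFunction
+
+    @conversationFunction({'speaker'})
+    def voc_mean_dur(segments: pd.DataFrame, **kwargs):
+        """mean duration (ms) of the vocalizations of a given speaker
+
+        Required keyword arguments:
+            - speaker : speaker_type label
+        """
+        # an empty selection yields NaN, so the function also behaves on an empty dataframe
+        return segments[segments['speaker_type'] == kwargs["speaker"]]['voc_duration'].mean()
+
+The decorator checks that the required arguments are supplied at extraction time and rejects the reserved labels ``name`` and ``callable``.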
+
+.. clidoc::
+
+   child-project conversations-summary /path/to/dataset output.csv custom --help
+
+Conversations extraction from parameter file
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To facilitate the extraction of conversations, one can simply use an exhaustive yml parameter file to launch a new extraction.
+This file has exactly the same structure as the one produced by the pipeline, so you can use the output parameter file of a previous extraction to rerun the same analysis.
+
+.. clidoc::
+
+   child-project conversations-specification --help
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e6bb07ab..9e40cf2b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -30,6 +30,7 @@ Welcome to ChildProject's documentation!
    tools
    annotations
    metrics
+   conversations
    processors
    samplers
    elan

From 4e2514f586d87e16b9d8713c24f3ede0114d0e14 Mon Sep 17 00:00:00 2001
From: Loann Peurey
Date: Thu, 4 Jul 2024 11:33:06 +0200
Subject: [PATCH 35/44] constrain pillow as 10.4.0 breaks matplotlib

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 3f7fc804..c9c8e3cc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ numpy==1.26.4; python_version >= '3.10'
 pandas==1.3.5; python_version <= '3.10'
 pandas==2.2.1; python_version >= '3.11'
 panoptes-client==1.6.1
+pillow<10.4.0 # this is constrained as matplotlib versions used fail on pillow > 10.3.0
 praat-parselmouth==0.4.3
 pyannote.core==5.0.0
 pydub==0.25.1

From 1181f84e765a4367bacc2dd3e494e690b49a274d Mon Sep 17 00:00:00 2001
From: Loann Peurey
Date: Thu, 4 Jul 2024 18:55:48 +0200
Subject: [PATCH 36/44] fix some parameters and docstrings

---
 .../pipelines/conversationFunctions.py        |  2 +-
 ChildProject/pipelines/conversations.py       | 19 ++++++++-----------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py
index 96dbbd21..6328983b 100644
--- a/ChildProject/pipelines/conversationFunctions.py
+++ b/ChildProject/pipelines/conversationFunctions.py
@@ -74,7 +74,7 @@ def who_finished(segments: pd.DataFrame):
 
     Required keyword arguments:
     """
-    return segments[segments['segment_offset'] == segments['segment_offset'].max()]['speaker_type']
+    return segments[segments['segment_offset'] == segments['segment_offset'].max()].iloc[0]['speaker_type']
 
 @conversationFunction()
 def participants(segments: pd.DataFrame):
diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py
index c89b33e7..0ca3dedc 100644
--- a/ChildProject/pipelines/conversations.py
+++ b/ChildProject/pipelines/conversations.py
@@ -111,6 +111,8 @@ def check_callable(row):
         )
         self.set = setname
         self.features_dict = features_list.to_dict(orient="index")
+        features_list['name'] = features_list.index
+        self.features_df = features_list
 
         # necessary columns to construct the conversations
         join_columns = {
@@ -260,21 +262,15 @@ def extract(self):
         :rtype: pandas.DataFrame
         """
         if self.threads == 1:
-            extractions = []
-            for rec in self.recordings:
-                segments = self.retrieve_segments(rec)
-                conversations = segments.groupby(grouper)
-
-                extractions += [self._process_conversation(block) for block in conversations]
-            self.conversations = pd.DataFrame(extractions) if len(extractions) else pd.DataFrame(columns=grouper)
+            results = list(itertools.chain.from_iterable(map(self._process_recording, self.recordings)))
         else:
-
             with mp.Pool(
                 processes=self.threads if self.threads >= 1 else mp.cpu_count()
            ) as pool:
                 results = list(itertools.chain.from_iterable(pool.map(self._process_recording, self.recordings)))
 
-            self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=grouper)
+        self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=grouper)
 
         # now add the rec_cols and child_cols in the result
         if self.rec_cols:
@@ -336,7 +332,7 @@ def retrieve_segments(self, recording: str):
             # no annotations for that unit
             return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required]
                                                  + list(annotations.columns) + ['conv_count'])))
-        segments = segments.dropna(subset='conv_count')
+        segments = segments.dropna(subset=['conv_count'])
     else:
         # no annotations for that unit
         return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required]
@@ -509,7 +505,7 @@ def run(self, path, destination, pipeline, func=None, **kwargs):
         self.conversations.to_csv(self.destination, index=False)
 
         # get the df of features used from the Conversations class
-        features_df = conversations.features_list
+        features_df = conversations.features_df
         features_df['callable'] = features_df.apply(lambda row: row['callable'].__name__, axis=1)  # from the callables used, find their name back
         parameters['features_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in
@@ -658,7 +654,7 @@ def run(self, parameters_input, func=None):
         self.conversations.to_csv(self.destination, index=False)
 
         # get the df of features used from the Conversations class
-        features_df = conversations.features_list
+        features_df = conversations.features_df
         features_df['callable'] = features_df.apply(lambda row: row['callable'].__name__, axis=1)  # from the callables used, find their name back
         parameters['features_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in

From 3949dfb0b035aa985800c99b4e54a360c5a524e3 Mon Sep 17 00:00:00 2001
From: Loann Peurey
Date: Thu, 4 Jul 2024 18:56:05 +0200
Subject: [PATCH 37/44] start to add tests and testing material

---
 tests/data/conversations_parameters.yml |  56 +++++++
 tests/data/list_features_conv.csv       |   6 +
 tests/test_conversations.py             | 206 ++++++++++++++++++++++++
 tests/test_metrics.py                   |   2 +-
 tests/truth/custom_conversations.csv    |   4 +
 tests/truth/python_conversations.csv    |   4 +
 tests/truth/standard_conversations.csv  |   4 +
 7 files changed, 281 insertions(+), 1 deletion(-)
 create mode 100644 tests/data/conversations_parameters.yml
 create mode 100644 tests/data/list_features_conv.csv
 create mode 100644 tests/test_conversations.py
 create mode 100644 tests/truth/custom_conversations.csv
 create mode 100644 tests/truth/python_conversations.csv
 create mode 100644 tests/truth/standard_conversations.csv

diff --git a/tests/data/conversations_parameters.yml b/tests/data/conversations_parameters.yml
new file mode 100644
index 00000000..b41a89df
--- /dev/null
+++ b/tests/data/conversations_parameters.yml
@@ -0,0 +1,56 @@
+package_version: 0.2.0
+date: '20240704_184950'
+parameters:
+  path: output/conversations
+  destination: output/conversations/extra/std_conv.csv
+  pipeline: standard
+  setname: custom_conv
+  recordings: null
+  from_time: null
+  to_time: null
+  rec_cols: null
+  child_cols: null
+  threads: 1
+  dataset_hash: 710ebfe1f4b118f8a48f69d896c6675bb72ef6ba
+  features_list:
+  - callable: who_initiated
+    args: {}
+    name: initiator
+  - callable: who_finished
+    args: {}
+    name: finisher
+  - callable: voc_total_dur
+    args: {}
+    name:
total_duration_of_vocalisations + - callable: voc_speaker_count + args: + speaker: CHI + name: CHI_voc_count + - callable: voc_speaker_count + args: + speaker: FEM + name: FEM_voc_count + - callable: voc_speaker_count + args: + speaker: MAL + name: MAL_voc_count + - callable: voc_speaker_count + args: + speaker: OCH + name: OCH_voc_count + - callable: voc_speaker_dur + args: + speaker: CHI + name: CHI_voc_dur + - callable: voc_speaker_dur + args: + speaker: FEM + name: FEM_voc_dur + - callable: voc_speaker_dur + args: + speaker: MAL + name: MAL_voc_dur + - callable: voc_speaker_dur + args: + speaker: OCH + name: OCH_voc_dur diff --git a/tests/data/list_features_conv.csv b/tests/data/list_features_conv.csv new file mode 100644 index 00000000..1802ad41 --- /dev/null +++ b/tests/data/list_features_conv.csv @@ -0,0 +1,6 @@ +callable,name,speaker +who_initiated,initiator, +who_finished,finisher, +participants,participants, +voc_dur_contribution,chi_dur_contrib,CHI +voc_dur_contribution,fem_dur_contrib,FEM diff --git a/tests/test_conversations.py b/tests/test_conversations.py new file mode 100644 index 00000000..01791ce5 --- /dev/null +++ b/tests/test_conversations.py @@ -0,0 +1,206 @@ +from functools import partial +import numpy as np +import os +import pandas as pd +import pytest +import shutil +from pathlib import Path + +from ChildProject.projects import ChildProject +from ChildProject.annotations import AnnotationManager +from ChildProject.pipelines.conversations import (Conversations, StandardConversations, CustomConversations, + ConversationsSpecificationPipeline) + +from ChildProject.pipelines.conversationFunctions import conversationFunction, RESERVED + +PATH = Path('output/conversations') + + +def fake_vocs(data, filename): + return data + + +@pytest.fixture(scope="function") +def project(request): + if os.path.exists(PATH): + # shutil.copytree(src="examples/valid_raw_data", dst="output/annotations") + shutil.rmtree(PATH) + shutil.copytree(src="examples/valid_raw_data", dst=PATH) + + project = ChildProject(PATH) + project.read() + + yield project + + +@pytest.fixture(scope="function") +def am(request, project): + am = AnnotationManager(project) + project.recordings['duration'] = [100000000, 2000000] #force longer durations to allow for imports + yield am + +@pytest.fixture(scope="function") +def segments(request): + segments = pd.read_csv("tests/data/csv.csv") + segments.loc[2:4, 'conv_count'] = 1 + segments.loc[8:9, 'conv_count'] = 2 + segments.loc[10:11, 'conv_count'] = 3 + + yield segments + + +def test_failures(project): + features = pd.DataFrame([["who_initiated", "initiator", pd.NA], + ["who_finished", "finisher", pd.NA], + ["voc_speaker_count", "CHI_voc_count", 'CHI'], + ], columns=['callable', 'name', 'speaker']) + + exception_caught = False + try: + standard = StandardConversations(project, setname="unknown") + except ValueError as e: + exception_caught = True + + assert ( + exception_caught is True + ), "StandardConversations failed to throw an exception despite an invalid set being provided" + + exception_caught = False + try: + custom = CustomConversations(project, setname="unknown", features='tests/data/list_features_conv.csv') + except ValueError as e: + exception_caught = True + + assert ( + exception_caught is True + ), "CustomConversations failed to throw an exception despite an invalid set being provided" + + +@pytest.mark.parametrize("error,col_change,new_value", + [(ValueError, 'name', 'finisher'), + (ValueError, 'callable', 'made_up_function'), + (TypeError, 
'speaker', 'FEM'),
+                          (None, None, None),
+                          ])
+def test_conversations(project, am, segments, error, col_change, new_value):
+
+    am.import_annotations(
+        pd.DataFrame(
+            [{ "set": "custom_conv",
+               "raw_filename": "file.its",
+               "time_seek": 0,
+               "recording_filename": "sound.wav",
+               "range_onset": 0,
+               "range_offset": 30000000,
+               "format": "csv",
+               }]
+        ),
+        import_function=partial(fake_vocs, segments),
+    )
+
+    features = pd.DataFrame([["who_initiated", "initiator", pd.NA],
+                             ["who_finished", "finisher", pd.NA],
+                             ["voc_speaker_count", "CHI_voc_count", 'CHI'],
+                             ], columns=['callable', 'name', 'speaker'])
+
+    if error:
+        with pytest.raises(error):
+            features.iloc[0, features.columns.get_loc(col_change)] = new_value
+            cm = Conversations(project, 'custom_conv', features)
+            cm.extract()
+    else:
+        cm = Conversations(project, 'custom_conv', features)
+        results = cm.extract()
+
+        # cm.conversations.to_csv("tests/truth/python_conversations.csv", index=False)
+        truth = pd.read_csv("tests/truth/python_conversations.csv")
+
+        pd.testing.assert_frame_equal(results, truth, check_like=True)
+
+#TODO adapt
+def test_standard(project, am, segments):
+    am.import_annotations(
+        pd.DataFrame(
+            [{"set": "custom_conv",
+              "raw_filename": "file.its",
+              "time_seek": 0,
+              "recording_filename": "sound.wav",
+              "range_onset": 0,
+              "range_offset": 30000000,
+              "format": "csv",
+              }]
+        ),
+        import_function=partial(fake_vocs, segments),
+    )
+
+    std = StandardConversations(project, setname='custom_conv', rec_cols='date_iso', child_cols='experiment,child_dob')
+    std.extract()
+
+    # std.conversations.to_csv("tests/truth/standard_conversations.csv", index=False)
+    truth = pd.read_csv("tests/truth/standard_conversations.csv")
+
+    pd.testing.assert_frame_equal(std.conversations, truth, check_like=True)
+
+
+#TODO adapt
+def test_custom(project, am, segments):
+    am.import_annotations(
+        pd.DataFrame(
+            [{"set": "custom_conv",
+              "raw_filename": "file.its",
+              "time_seek": 0,
+              "recording_filename": "sound.wav",
+              "range_onset": 0,
+              "range_offset": 30000000,
+              "format": "csv",
+              }]
+        ),
+        import_function=partial(fake_vocs, segments),
+    )
+
+    parameters = "tests/data/list_features_conv.csv"
+
+    cm = CustomConversations(project, 'custom_conv', parameters)
+    cm.extract()
+
+    # cm.conversations.to_csv("tests/truth/custom_conversations.csv", index=False)
+    truth = pd.read_csv("tests/truth/custom_conversations.csv")
+
+    pd.testing.assert_frame_equal(cm.conversations, truth, check_like=True)
+
+
+#TODO adapt
+def test_specs(project, am, segments):
+    am.import_annotations(
+        pd.DataFrame(
+            [{"set": "custom_conv",
+              "raw_filename": "file.its",
+              "time_seek": 0,
+              "recording_filename": "sound.wav",
+              "range_onset": 0,
+              "range_offset": 30000000,
+              "format": "csv",
+              }]
+        ),
+        import_function=partial(fake_vocs, segments),
+    )
+
+    csp = ConversationsSpecificationPipeline()
+
+    parameters = "tests/data/conversations_parameters.yml"
+    csp.run(parameters)
+
+    output = pd.read_csv(csp.destination)
+    # output.to_csv("tests/truth/specs_conversations.csv", index=False)
+    truth = pd.read_csv("tests/truth/specs_conversations.csv")
+
+    pd.testing.assert_frame_equal(output, truth, check_like=True)
+
+    new_params = csp.parameters_path
+    csp.run(new_params)
+
+    output = pd.read_csv(csp.destination)
+
+    pd.testing.assert_frame_equal(output, truth, check_like=True)
\ No newline at end of file
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index a7fed408..275fe5ff 100644
--- a/tests/test_metrics.py
+++
b/tests/test_metrics.py @@ -36,7 +36,7 @@ def am(request, project): project.recordings['duration'] = [100000000, 2000000] #force longer durations to allow for imports yield am -#decorating functions with reserved kwargs should fail +# decorating functions with reserved kwargs should fail @pytest.mark.parametrize("error", [ValueError, ]) def test_decorator(error): for reserved in RESERVED: diff --git a/tests/truth/custom_conversations.csv b/tests/truth/custom_conversations.csv new file mode 100644 index 00000000..72e34c70 --- /dev/null +++ b/tests/truth/custom_conversations.csv @@ -0,0 +1,4 @@ +conversation_onset,conversation_offset,voc_count,conv_count,interval_last_conv,recording_filename,initiator,finisher,participants,chi_dur_contrib,fem_dur_contrib +1984136,1988951,3,1.0,,sound.wav,CHI,FEM,CHI/OCH/FEM,0.1286786786786787,0.5193693693693694 +28284010,28287945,2,2.0,26295059.0,sound.wav,OCH,OCH,OCH/MAL,, +28288492,28294692,2,3.0,2917.0,sound.wav,OCH,MAL,OCH/MAL,, diff --git a/tests/truth/python_conversations.csv b/tests/truth/python_conversations.csv new file mode 100644 index 00000000..3f1b46cc --- /dev/null +++ b/tests/truth/python_conversations.csv @@ -0,0 +1,4 @@ +conversation_onset,conversation_offset,voc_count,conv_count,interval_last_conv,recording_filename,initiator,finisher,CHI_voc_count +1984136,1988951,3,1.0,,sound.wav,CHI,FEM,1 +28284010,28287945,2,2.0,26295059.0,sound.wav,OCH,OCH,0 +28288492,28294692,2,3.0,2917.0,sound.wav,OCH,MAL,0 diff --git a/tests/truth/standard_conversations.csv b/tests/truth/standard_conversations.csv new file mode 100644 index 00000000..a01f6202 --- /dev/null +++ b/tests/truth/standard_conversations.csv @@ -0,0 +1,4 @@ +conversation_onset,conversation_offset,voc_count,conv_count,interval_last_conv,recording_filename,initiator,finisher,total_duration_of_vocalisations,CHI_voc_count,FEM_voc_count,MAL_voc_count,OCH_voc_count,CHI_voc_dur,FEM_voc_dur,MAL_voc_dur,OCH_voc_dur,child_id,date_iso,experiment,child_dob +1984136,1988951,3,1.0,,sound.wav,CHI,FEM,6660,1,1,0,1,857.0,3459.0,,2344,1,2020-04-20,test,2020-01-01 +28284010,28287945,2,2.0,26295059.0,sound.wav,OCH,OCH,4089,0,0,1,1,,,154.0,3935,1,2020-04-20,test,2020-01-01 +28288492,28294692,2,3.0,2917.0,sound.wav,OCH,MAL,1001,0,0,1,1,,,486.0,515,1,2020-04-20,test,2020-01-01 From 4242ed3f58c2ab48d2e3fde2e3851d391093b0a1 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Fri, 5 Jul 2024 18:47:58 +0200 Subject: [PATCH 38/44] make parameters file pipeline ok with arguments --- ChildProject/pipelines/conversations.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index 0ca3dedc..6842b1bc 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -88,6 +88,7 @@ def check_callable(row): raise ValueError( "{} cannot be evaluated as a feature, must be a callable object or a string".format(row["callable"])) + self.features_df = features_list # block checking presence of required columns and evaluates the callable functions if isinstance(features_list, pd.DataFrame): if ({'callable', 'name'}).issubset(features_list.columns): @@ -111,8 +112,6 @@ def check_callable(row): ) self.set = setname self.features_dict = features_list.to_dict(orient="index") - features_list['name'] = features_list.index - self.features_df = features_list # necessary columns to construct the conversations join_columns = { From 99ff0de671bbfa7edcf76646037585c2ec880dfa Mon Sep 17 00:00:00 2001 From: Loann 
Peurey Date: Fri, 5 Jul 2024 18:49:18 +0200 Subject: [PATCH 39/44] update parameter pipeline test --- tests/data/conversations_parameters.yml | 29 ++++++++----------------- tests/truth/specs_conversations.csv | 4 ++++ 2 files changed, 13 insertions(+), 20 deletions(-) create mode 100644 tests/truth/specs_conversations.csv diff --git a/tests/data/conversations_parameters.yml b/tests/data/conversations_parameters.yml index b41a89df..c245193a 100644 --- a/tests/data/conversations_parameters.yml +++ b/tests/data/conversations_parameters.yml @@ -1,5 +1,5 @@ package_version: 0.2.0 -date: '20240704_184950' +date: '20240705_184314' parameters: path: output/conversations destination: output/conversations/extra/std_conv.csv @@ -14,43 +14,32 @@ parameters: dataset_hash: 710ebfe1f4b118f8a48f69d896c6675bb72ef6ba features_list: - callable: who_initiated - args: {} name: initiator - callable: who_finished - args: {} name: finisher - callable: voc_total_dur - args: {} name: total_duration_of_vocalisations - callable: voc_speaker_count - args: - speaker: CHI name: CHI_voc_count + speaker: CHI - callable: voc_speaker_count - args: - speaker: FEM name: FEM_voc_count + speaker: FEM - callable: voc_speaker_count - args: - speaker: MAL name: MAL_voc_count + speaker: MAL - callable: voc_speaker_count - args: - speaker: OCH name: OCH_voc_count + speaker: OCH - callable: voc_speaker_dur - args: - speaker: CHI name: CHI_voc_dur + speaker: CHI - callable: voc_speaker_dur - args: - speaker: FEM name: FEM_voc_dur + speaker: FEM - callable: voc_speaker_dur - args: - speaker: MAL name: MAL_voc_dur + speaker: MAL - callable: voc_speaker_dur - args: - speaker: OCH name: OCH_voc_dur + speaker: OCH diff --git a/tests/truth/specs_conversations.csv b/tests/truth/specs_conversations.csv new file mode 100644 index 00000000..30bdc9c2 --- /dev/null +++ b/tests/truth/specs_conversations.csv @@ -0,0 +1,4 @@ +conversation_onset,conversation_offset,voc_count,conv_count,interval_last_conv,recording_filename,initiator,finisher,total_duration_of_vocalisations,CHI_voc_count,FEM_voc_count,MAL_voc_count,OCH_voc_count,CHI_voc_dur,FEM_voc_dur,MAL_voc_dur,OCH_voc_dur +1984136,1988951,3,1.0,,sound.wav,CHI,FEM,6660,1,1,0,1,857.0,3459.0,,2344 +28284010,28287945,2,2.0,26295059.0,sound.wav,OCH,OCH,4089,0,0,1,1,,,154.0,3935 +28288492,28294692,2,3.0,2917.0,sound.wav,OCH,MAL,1001,0,0,1,1,,,486.0,515 From e56c5d9a272f646de54e16f8d0f44c7fd9517b5d Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Mon, 8 Jul 2024 10:44:36 +0200 Subject: [PATCH 40/44] tests for individual conversation extraction functions --- tests/test_conversationFunctions.py | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/test_conversationFunctions.py diff --git a/tests/test_conversationFunctions.py b/tests/test_conversationFunctions.py new file mode 100644 index 00000000..c1cc04e3 --- /dev/null +++ b/tests/test_conversationFunctions.py @@ -0,0 +1,31 @@ +import pandas as pd +import pytest + +import ChildProject.pipelines.conversationFunctions as cf + +@pytest.fixture(scope="function") +def segments(request): + segments = pd.read_csv("tests/data/csv.csv").dropna(subset=['speaker_type']) + segments['voc_duration'] = segments['segment_offset'] - segments['segment_onset'] + + yield segments + +@pytest.mark.parametrize("function,parameters,truth", + [(cf.who_initiated, {}, 'CHI'), + (cf.who_finished, {}, 'MAL'), + (cf.participants, {}, 'CHI/OCH/FEM/MAL'), + (cf.voc_total_dur, {}, 15034), + (cf.is_speaker, {'speaker':'XXX'}, False), + 
(cf.is_speaker, {'speaker':'OCH'}, True), + (cf.voc_speaker_count, {'speaker':'CHI'}, 1), + (cf.voc_speaker_count, {'speaker':'OCH'}, 3), + (cf.voc_speaker_dur, {'speaker':'MAL'}, 3924), + (cf.voc_speaker_dur, {'speaker':'FEM'}, 3459), + (cf.voc_dur_contribution, {'speaker':'FEM'}, 3459/15034), + (cf.voc_dur_contribution, {'speaker':'OCH'}, 6794/15034), + (cf.assign_conv_type, {}, 'multiparty'), + ]) +def test_conversations(segments, function, parameters, truth): + result = function(segments, **parameters) + + assert result == truth \ No newline at end of file From ed785229ea74ecb133ab02b524d56e08786cfd62 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Mon, 8 Jul 2024 11:53:08 +0200 Subject: [PATCH 41/44] CHANGELOG --- CHANGELOG.md | 4 ++++ tests/test_conversationFunctions.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73177736..2bc3d66f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Added + +- conversations summary extraction pipeline + ## [0.2.2] 2024-06-26 ### Added diff --git a/tests/test_conversationFunctions.py b/tests/test_conversationFunctions.py index c1cc04e3..b3a122e8 100644 --- a/tests/test_conversationFunctions.py +++ b/tests/test_conversationFunctions.py @@ -15,14 +15,14 @@ def segments(request): (cf.who_finished, {}, 'MAL'), (cf.participants, {}, 'CHI/OCH/FEM/MAL'), (cf.voc_total_dur, {}, 15034), - (cf.is_speaker, {'speaker':'XXX'}, False), - (cf.is_speaker, {'speaker':'OCH'}, True), - (cf.voc_speaker_count, {'speaker':'CHI'}, 1), - (cf.voc_speaker_count, {'speaker':'OCH'}, 3), - (cf.voc_speaker_dur, {'speaker':'MAL'}, 3924), - (cf.voc_speaker_dur, {'speaker':'FEM'}, 3459), - (cf.voc_dur_contribution, {'speaker':'FEM'}, 3459/15034), - (cf.voc_dur_contribution, {'speaker':'OCH'}, 6794/15034), + (cf.is_speaker, {'speaker': 'XXX'}, False), + (cf.is_speaker, {'speaker': 'OCH'}, True), + (cf.voc_speaker_count, {'speaker': 'CHI'}, 1), + (cf.voc_speaker_count, {'speaker': 'OCH'}, 3), + (cf.voc_speaker_dur, {'speaker': 'MAL'}, 3924), + (cf.voc_speaker_dur, {'speaker': 'FEM'}, 3459), + (cf.voc_dur_contribution, {'speaker': 'FEM'}, 3459/15034), + (cf.voc_dur_contribution, {'speaker': 'OCH'}, 6794/15034), (cf.assign_conv_type, {}, 'multiparty'), ]) def test_conversations(segments, function, parameters, truth): From a2d151f993969d76a18bf2e60e8704c1eb2b6143 Mon Sep 17 00:00:00 2001 From: alix-bourree Date: Tue, 23 Jul 2024 11:02:38 +0200 Subject: [PATCH 42/44] add edge cases --- tests/test_conversations.py | 146 +++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 1 deletion(-) diff --git a/tests/test_conversations.py b/tests/test_conversations.py index 01791ce5..76708520 100644 --- a/tests/test_conversations.py +++ b/tests/test_conversations.py @@ -203,4 +203,148 @@ def test_specs(project, am, segments): output = pd.read_csv(csp.destination) - pd.testing.assert_frame_equal(output, truth, check_like=True) \ No newline at end of file + pd.testing.assert_frame_equal(output, truth, check_like=True) + + +def test_empty_conversations(project, am): + empty_segments = pd.DataFrame(columns=["segment_onset", "segment_offset", "speaker_type", "time_since_last_conv", "conv_count"]) + + am.import_annotations( + pd.DataFrame( + [{"set": "empty_conv", + "raw_filename": "file.its", + "time_seek": 0, + "recording_filename": "sound.wav", + "range_onset": 0, + "range_offset": 30000000, + "format": "csv", + 
}] + ), + import_function=partial(fake_vocs, empty_segments), + ) + + std = StandardConversations(project, setname='empty_conv') + results = std.extract() + + assert results.empty, "The result should be empty for an empty dataset" +def test_nan_values(project, am): + nan_segments = pd.DataFrame({ + "segment_onset": [np.nan, 10, 20], + "segment_offset": [5, np.nan, 25], + "speaker_type": ["CHI", np.nan, "FEM"], + "time_since_last_conv": [np.nan, 15, 5], + "conv_count": [1, 1, 2] + }) + + am.import_annotations( + pd.DataFrame( + [{"set": "nan_conv", + "raw_filename": "file.its", + "time_seek": 0, + "recording_filename": "sound.wav", + "range_onset": 0, + "range_offset": 30000000, + "format": "csv", + }] + ), + import_function=partial(fake_vocs, nan_segments), + ) + + std = StandardConversations(project, setname='nan_conv') + results = std.extract() + + assert not results.empty, "The result should not be empty for a dataset with NaN values" + + +def test_single_entry_conversation(project, am): + single_segment = pd.DataFrame({ + "segment_onset": [0], + "segment_offset": [5], + "speaker_type": ["CHI"], + "time_since_last_conv": [np.nan], + "conv_count": [1] + }) + + am.import_annotations( + pd.DataFrame( + [{"set": "single_conv", + "raw_filename": "file.its", + "time_seek": 0, + "recording_filename": "sound.wav", + "range_onset": 0, + "range_offset": 30000000, + "format": "csv", + }] + ), + import_function=partial(fake_vocs, single_segment), + ) + + std = StandardConversations(project, setname='single_conv') + results = std.extract() + + assert len(results) == 1, "The result should contain one conversation for a single entry dataset" + + +def test_incorrect_data_types(project, am): + incorrect_types = pd.DataFrame({ + "segment_onset": ["0", "10", "20"], + "segment_offset": ["5", "15", "25"], + "speaker_type": ["CHI", "FEM", "MAN"], + "time_since_last_conv": ["nan", "15", "5"], + "conv_count": [1, 1, 2] + }) + + am.import_annotations( + pd.DataFrame( + [{"set": "incorrect_types_conv", + "raw_filename": "file.its", + "time_seek": 0, + "recording_filename": "sound.wav", + "range_onset": 0, + "range_offset": 30000000, + "format": "csv", + }] + ), + import_function=partial(fake_vocs, incorrect_types), + ) + + std = StandardConversations(project, setname='incorrect_types_conv') + with pytest.raises(Exception): + std.extract(), "The code should raise an exception for incorrect data types" + + +def test_unsorted_annotations(project, am): + unsorted_segments = pd.DataFrame({ + "segment_onset": [20, 0, 10], + "segment_offset": [25, 5, 15], + "speaker_type": ["FEM", "CHI", "MAN"], + "time_since_last_conv": [5, np.nan, 15], + "conv_count": [2, 1, 1] + }) + + am.import_annotations( + pd.DataFrame( + [{"set": "unsorted_conv", + "raw_filename": "file.its", + "time_seek": 0, + "recording_filename": "sound.wav", + "range_onset": 0, + "range_offset": 30000000, + "format": "csv", + }] + ), + import_function=partial(fake_vocs, unsorted_segments), + ) + + std = StandardConversations(project, setname='unsorted_conv') + results = std.extract() + + assert not results.empty, "The result should not be empty for unsorted annotations" + + +def test_all_cases(project, am, segments): + test_empty_conversations(project, am) + test_nan_values(project, am) + test_single_entry_conversation(project, am) + test_incorrect_data_types(project, am) + test_unsorted_annotations(project, am) \ No newline at end of file From d522cba9d412f7ad282dc3c3cb5fa142fd5a7bcf Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Thu, 1 Aug 2024 
11:56:22 +0200 Subject: [PATCH 43/44] no exception on empty dataframe --- ChildProject/pipelines/conversations.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index 6842b1bc..4f1f25b1 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -5,6 +5,7 @@ import datetime import multiprocessing as mp import logging +import functools import numpy as np import pandas as pd @@ -223,7 +224,7 @@ def _process_conversation(self, conversation, rec): #process recording line return result - def _process_recording(self, recording): + def _process_recording(self, recording, grouper): """for one recording, get the segments required, group by conversation and launch computation for each block :param recording: recording_filename to which belongs that conversation @@ -231,7 +232,6 @@ def _process_recording(self, recording): :return: dict containing all the computed features result for that unit :rtype: list[dict] """ - grouper = 'conv_count' segments = self.retrieve_segments(recording) segments['voc_duration'] = segments['segment_offset'] - segments['segment_onset'] @@ -260,16 +260,17 @@ def extract(self): :return: DataFrame of computed features :rtype: pandas.DataFrame """ + grouper = 'conv_count' if self.threads == 1: - results = list(itertools.chain.from_iterable(map(self._process_recording, self.recordings))) + results = list(itertools.chain.from_iterable(map(functools.partial(self._process_recording, grouper=grouper), self.recordings))) else: with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() ) as pool: - results = list(itertools.chain.from_iterable(pool.map(self._process_recording, self.recordings))) + results = list(itertools.chain.from_iterable(pool.map(functools.partial(self._process_recording, grouper=grouper), self.recordings))) - self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=grouper) + self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=[grouper]) # now add the rec_cols and child_cols in the result if self.rec_cols: From e69ae028b79002a09a710427d3ee785ef4359bdf Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Thu, 1 Aug 2024 12:01:05 +0200 Subject: [PATCH 44/44] move test data types to annotations, na from conversation --- tests/test_annotations.py | 25 +++++++++++++++ tests/test_conversations.py | 63 ------------------------------------- 2 files changed, 25 insertions(+), 63 deletions(-) diff --git a/tests/test_annotations.py b/tests/test_annotations.py index 7808c34a..2d3f67b8 100644 --- a/tests/test_annotations.py +++ b/tests/test_annotations.py @@ -285,6 +285,31 @@ def test_multiple_imports(project, am, input_file, ow, rimported, rerrors, excep assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected" +def test_import_incorrect_data_types(project, am): + incorrect_types = pd.DataFrame({ + "segment_onset": ["0", "10", "20"], + "segment_offset": ["5", "15", "25"], + "speaker_type": ["CHI", "FEM", "MAN"], + "time_since_last_conv": ["nan", "15", "5"], + "conv_count": [1, 1, 2] + }) + + with pytest.raises(Exception): + am.import_annotations( + pd.DataFrame( + [{"set": "incorrect_types_conv", + "raw_filename": "file.its", + "time_seek": 0, + "recording_filename": "sound.wav", + "range_onset": 0, + "range_offset": 30000000, + "format": "csv", + }] + ), + import_function=partial(fake_vocs, incorrect_types), + ) + + 
# function used as a derivation function, it should throw errors if not returning dataframe or without required columns def dv_func(a, b, x, type): if type == 'number': diff --git a/tests/test_conversations.py b/tests/test_conversations.py index 76708520..12dfa80c 100644 --- a/tests/test_conversations.py +++ b/tests/test_conversations.py @@ -227,33 +227,6 @@ def test_empty_conversations(project, am): results = std.extract() assert results.empty, "The result should be empty for an empty dataset" -def test_nan_values(project, am): - nan_segments = pd.DataFrame({ - "segment_onset": [np.nan, 10, 20], - "segment_offset": [5, np.nan, 25], - "speaker_type": ["CHI", np.nan, "FEM"], - "time_since_last_conv": [np.nan, 15, 5], - "conv_count": [1, 1, 2] - }) - - am.import_annotations( - pd.DataFrame( - [{"set": "nan_conv", - "raw_filename": "file.its", - "time_seek": 0, - "recording_filename": "sound.wav", - "range_onset": 0, - "range_offset": 30000000, - "format": "csv", - }] - ), - import_function=partial(fake_vocs, nan_segments), - ) - - std = StandardConversations(project, setname='nan_conv') - results = std.extract() - - assert not results.empty, "The result should not be empty for a dataset with NaN values" def test_single_entry_conversation(project, am): @@ -285,34 +258,6 @@ def test_single_entry_conversation(project, am): assert len(results) == 1, "The result should contain one conversation for a single entry dataset" -def test_incorrect_data_types(project, am): - incorrect_types = pd.DataFrame({ - "segment_onset": ["0", "10", "20"], - "segment_offset": ["5", "15", "25"], - "speaker_type": ["CHI", "FEM", "MAN"], - "time_since_last_conv": ["nan", "15", "5"], - "conv_count": [1, 1, 2] - }) - - am.import_annotations( - pd.DataFrame( - [{"set": "incorrect_types_conv", - "raw_filename": "file.its", - "time_seek": 0, - "recording_filename": "sound.wav", - "range_onset": 0, - "range_offset": 30000000, - "format": "csv", - }] - ), - import_function=partial(fake_vocs, incorrect_types), - ) - - std = StandardConversations(project, setname='incorrect_types_conv') - with pytest.raises(Exception): - std.extract(), "The code should raise an exception for incorrect data types" - - def test_unsorted_annotations(project, am): unsorted_segments = pd.DataFrame({ "segment_onset": [20, 0, 10], @@ -340,11 +285,3 @@ def test_unsorted_annotations(project, am): results = std.extract() assert not results.empty, "The result should not be empty for unsorted annotations" - - -def test_all_cases(project, am, segments): - test_empty_conversations(project, am) - test_nan_values(project, am) - test_single_entry_conversation(project, am) - test_incorrect_data_types(project, am) - test_unsorted_annotations(project, am) \ No newline at end of file