diff --git a/ChildProject/pipelines/conversationFunctions.py b/ChildProject/pipelines/conversationFunctions.py index 96dbbd21..6328983b 100644 --- a/ChildProject/pipelines/conversationFunctions.py +++ b/ChildProject/pipelines/conversationFunctions.py @@ -74,7 +74,7 @@ def who_finished(segments: pd.DataFrame): Required keyword arguments: """ - return segments[segments['segment_offset'] == segments['segment_offset'].max()]['speaker_type'] + return segments[segments['segment_offset'] == segments['segment_offset'].max()].iloc[0]['speaker_type'] @conversationFunction() def participants(segments: pd.DataFrame): diff --git a/ChildProject/pipelines/conversations.py b/ChildProject/pipelines/conversations.py index dccc88d7..0ca3dedc 100644 --- a/ChildProject/pipelines/conversations.py +++ b/ChildProject/pipelines/conversations.py @@ -111,6 +111,8 @@ def check_callable(row): ) self.set = setname self.features_dict = features_list.to_dict(orient="index") + features_list['name'] = features_list.index + self.features_df = features_list # necessary columns to construct the conversations join_columns = { @@ -260,21 +262,15 @@ def extract(self): :rtype: pandas.DataFrame """ if self.threads == 1: - extractions = [] - for rec in self.recordings: - segments = self.retrieve_segments(rec) - conversations = segments.groupby(grouper) - - extractions += [self._process_conversation(block) for block in conversations] - self.conversations = pd.DataFrame(extractions) if len(extractions) else pd.DataFrame(columns=grouper) + results = list(itertools.chain.from_iterable(map(self._process_recording, self.recordings))) else: with mp.Pool( processes=self.threads if self.threads >= 1 else mp.cpu_count() ) as pool: results = list(itertools.chain.from_iterable(pool.map(self._process_recording, self.recordings))) - self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=grouper) + self.conversations = pd.DataFrame(results) if len(results) else pd.DataFrame(columns=grouper) # now add the rec_cols and child_cols in the result if self.rec_cols: @@ -336,7 +332,7 @@ def retrieve_segments(self, recording: str): # no annotations for that unit return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] + list(annotations.columns) + ['conv_count']))) - segments = segments.dropna(subset='conv_count') + segments = segments.dropna(subset=['conv_count']) else: # no annotations for that unit return pd.DataFrame(columns=list(set([c.name for c in AnnotationManager.SEGMENTS_COLUMNS if c.required] @@ -509,7 +505,7 @@ def run(self, path, destination, pipeline, func=None, **kwargs): self.conversations.to_csv(self.destination, index=False) # get the df of features used from the Conversations class - features_df = conversations.features_list + features_df = conversations.features_df features_df['callable'] = features_df.apply(lambda row: row['callable'].__name__, axis=1) # from the callables used, find their name back parameters['features_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in @@ -658,7 +654,8 @@ def run(self, parameters_input, func=None): self.conversations.to_csv(self.destination, index=False) # get the df of features used from the Conversations class - features_df = conversations.features_list + features_df = conversations.features_df + print(features_df) features_df['callable'] = features_df.apply(lambda row: row['callable'].__name__, axis=1) # from the callables used, find their name back parameters['features_list'] = [{k: v for k, v in m.items() if pd.notnull(v)} for m in