Merge pull request #466 from LAAC-LSCP/eaf-builder/subtree

Eaf builder/subtree
LAAC-LSCP · Apr 9, 2024 · 67525c1 · 67525c1
2 parents 17c5a6a + 19b3cc7
commit 67525c1
Show file tree

Hide file tree

Showing 3 changed files with 100 additions and 19 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Fixed
+
+- eaf_builder no longer replicates the subtree structure of recordings, and individual files are no longer placed in individual subfolders
+
 ## [0.1.2] 2024-03-14
 
 ### Added

diff --git a/ChildProject/pipelines/eafbuilder.py b/ChildProject/pipelines/eafbuilder.py
@@ -3,6 +3,7 @@
 import sys
 import os
 import shutil
+from pathlib import Path
 
 from ChildProject.projects import ChildProject
 from ChildProject.annotations import AnnotationManager
@@ -22,7 +23,6 @@ def create_eaf(
     eaf_type: str,
     contxt_on: int,
     contxt_off: int,
-    template: str,
     speech_segments: pd.DataFrame = None,
     imported_set: str = None,
     imported_format: str = None,
@@ -69,7 +69,7 @@ def create_eaf(
                 speaker_id = segment["speaker_id"]
             elif "speaker_type" in segment:
                 speaker_id = segment["speaker_type"]
-                if pd.isnull(speaker_id) and imported_format in FORMAT_SPEECH : speaker_id = "SPEECH" #replace  nan with SPEECH for some formats
+                if pd.isnull(speaker_id) and imported_format in FORMAT_SPEECH: speaker_id = "SPEECH" #replace  nan with SPEECH for some formats
 
             if speaker_id is None:
                 continue
@@ -183,7 +183,8 @@ def run(
             imported_set = import_speech_from
 
         for recording_filename, segs in segments.groupby("recording_filename"):
-            recording_prefix = os.path.splitext(recording_filename)[0]
+            full_recording = Path(recording_filename)
+            recording_prefix = full_recording.stem
             output_filename = (
                 recording_prefix + "_" + eaf_type + "_" + os.path.basename(template)
             )
@@ -217,7 +218,7 @@ def run(
                     imported_format = None
 
 
-            output_dir = os.path.join(destination, recording_prefix)
+            output_dir = os.path.join(destination, full_recording.parent)
 
             create_eaf(
                 etf_path,
@@ -228,7 +229,6 @@ def run(
                 eaf_type,
                 context_onset,
                 context_offset,
-                template,
                 speech_segments,
                 imported_set,
                 imported_format,

diff --git a/tests/test_eaf.py b/tests/test_eaf.py
@@ -5,26 +5,103 @@
 import shutil
 import pytest
 
+
 from ChildProject.projects import ChildProject
 from ChildProject.annotations import AnnotationManager
 from ChildProject.pipelines.samplers import PeriodicSampler
-from ChildProject.pipelines.eafbuilder import EafBuilderPipeline
+from ChildProject.pipelines.eafbuilder import EafBuilderPipeline, create_eaf
 
 IMP_FROM = 'vtc'
+PATH = os.path.join('output', 'eaf')
 
 def fake_vocs(data, filename):
     return data
 
 @pytest.fixture(scope="function")
 def project(request):
-    if not os.path.exists("output/eaf"):
-        shutil.copytree(src="examples/valid_raw_data", dst="output/eaf")
+    if os.path.exists(PATH):
+        shutil.rmtree(PATH)
+    shutil.copytree(src="examples/valid_raw_data", dst=PATH)
 
-    project = ChildProject("output/eaf")
+    project = ChildProject(PATH)
     project.read()
 
     yield project
 
+
+IMP = pd.DataFrame({'segment_onset': [10], 'segment_offset': [15], 'speaker_type': ['FEM']})
+TEMP = os.path.join('ChildProject', 'templates', 'basic.etf')
+@pytest.mark.parametrize(("etf_path,output_dir,recording_filename,timestamps_list,eaf_type,context_on,context_off,speech_segments,imported_set,error"),
+[[5, PATH, 'sound.wav', [], 'periodic', 0, 0, IMP, 'vtc', FileNotFoundError],
+['README.md', PATH, 'sound.wav', [], 'periodic', 0, 0, IMP, 'vtc', Exception],
+[TEMP, 6, 'sound.wav', [], 'periodic', 0, 0, IMP, 'vtc', TypeError],
+[TEMP, PATH, 8, [], 'periodic', 0, 0, IMP, 'vtc', TypeError],
+[TEMP, PATH, 'sound.wav', 5, 'periodic', 0, 0, IMP, 'vtc', TypeError],
+[TEMP, PATH, 'sound.wav', [(5, 'abc')], 'periodic', 0, 0, IMP, 'vtc', ValueError],
+[TEMP, PATH, 'sound.wav', [(5, 10)], 'periodic', 'xp', 0, IMP, 'vtc', TypeError],
+[TEMP, PATH, 'sound.wav', [(5, 10)], 'periodic', 0, 0, 'x', 'vtc', AttributeError],
+[TEMP, PATH, 'sound.wav', [(5, 10)], 'periodic', 0, 0, IMP.drop(columns=['segment_offset']), 'vtc', KeyError],
+[TEMP, PATH, 'sound.wav', [(5, 10)], 'periodic', 0, 0, IMP, 5, AttributeError],
+    ])
+def test_create_eaf_inputs(project, etf_path, output_dir, recording_filename, timestamps_list, eaf_type, context_on,
+                           context_off, speech_segments, imported_set, error):
+    with pytest.raises(error):
+        create_eaf(etf_path, 'sound', output_dir, recording_filename, timestamps_list, eaf_type, context_on, context_off,
+                   speech_segments, imported_set, 'vtc_rttm')
+
+def test_create_eaf(project):
+
+    timestamps_list = [(10, 20), (30, 40), (50, 60)]
+
+    create_eaf(TEMP, 'sound', os.path.join(PATH, 'extra/eaf'), 'sound.wav', timestamps_list, 'periodic', 10, 10,
+               IMP, 'vtc', 'vtc_rttm')
+
+    eaf = Eaf(os.path.join(PATH, 'extra/eaf/sound.eaf'))
+
+    code = eaf.tiers['code_periodic'][0]
+    segments = []
+
+    for pid in code:
+        (start_ts, end_ts, value, svg_ref) = code[pid]
+        (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
+        segments.append({'segment_onset': int(start_t), 'segment_offset': int(end_t)})
+
+    timestamps = []
+    for pid in timestamps_list:
+        timestamps.append({'segment_onset': pid[0], 'segment_offset': pid[1]})
+
+    segments = pd.DataFrame(segments)
+    timestamps = pd.DataFrame(timestamps)
+
+    pd.testing.assert_frame_equal(
+        segments[['segment_onset', 'segment_offset']].sort_values(['segment_onset', 'segment_offset']).reset_index(
+            drop=True),
+        timestamps[['segment_onset', 'segment_offset']].sort_values(
+            ['segment_onset', 'segment_offset']).reset_index(drop=True),
+        check_dtype=False,
+    )
+
+    segments = []
+    vtc_speech = eaf.tiers['VTC-FEM'][0]
+    for pid in vtc_speech:
+        (start_ts, end_ts, value, svg_ref) = vtc_speech[pid]
+        (start_t, end_t) = (eaf.timeslots[start_ts], eaf.timeslots[end_ts])
+        segments.append({'segment_onset': int(start_t), 'segment_offset': int(end_t)})
+
+    segments = pd.DataFrame(segments)
+
+    pd.testing.assert_frame_equal(
+        segments[['segment_onset', 'segment_offset']].sort_values(['segment_onset', 'segment_offset']).reset_index(
+            drop=True),
+        IMP[['segment_onset', 'segment_offset']].sort_values(['segment_onset', 'segment_offset']).reset_index(
+            drop=True)
+    )
+
+    assert eaf.media_descriptors[0]['MEDIA_URL'] == 'sound.wav'
+
+
+# @pytest.mark.parametrize("segments,type,template,context_onset,context_offset,path,import_speech_from",
+#                          [])
 def test_periodic(project):
     """
     os.makedirs('output/eaf', exist_ok = True)
@@ -55,9 +132,9 @@ def test_periodic(project):
         import_function=partial(fake_vocs, data),
     )
 
-    sampler = PeriodicSampler(project, 500, 500, 250, recordings = ['sound.wav'])
+    sampler = PeriodicSampler(project, 500, 500, 250, recordings=['sound.wav'])
     sampler.sample()
-    sampler.segments.to_csv('output/eaf/segments.csv')
+    sampler.segments.to_csv(os.path.join(PATH, 'segments.csv'))
 
     ranges = sampler.segments.rename(
                     columns={
@@ -71,17 +148,17 @@ def test_periodic(project):
 
     eaf_builder = EafBuilderPipeline()
     eaf_builder.run(
-        destination = 'output/eaf',
-        segments = 'output/eaf/segments.csv',
-        eaf_type = 'periodic',
-        template = 'basic',
-        context_onset = 250,
-        context_offset = 250,
-        path='output/eaf',
+        destination=os.path.join(PATH, 'extra', 'eaf'),
+        segments=os.path.join(PATH, 'segments.csv'),
+        eaf_type='periodic',
+        template='basic',
+        context_onset=250,
+        context_offset=250,
+        path=PATH,
         import_speech_from='vtc',
     )
 
-    eaf = Eaf('output/eaf/sound/sound_periodic_basic.eaf')
+    eaf = Eaf(os.path.join(PATH, 'extra/eaf/sound_periodic_basic.eaf'))
 
     code = eaf.tiers['code_periodic'][0]
     segments = []