Skip to content

Commit

Permalink
Merge pull request #466 from LAAC-LSCP/eaf-builder/subtree
Browse files Browse the repository at this point in the history
Eaf builder/subtree
  • Loading branch information
LoannPeurey authored Apr 9, 2024
2 parents 17c5a6a + 19b3cc7 commit 67525c1
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 19 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

### Fixed

- eaf_builder no longer replicates the subtree structure of recordings, and individual files are no longer placed in individual subfolders

## [0.1.2] 2024-03-14

### Added
Expand Down
10 changes: 5 additions & 5 deletions ChildProject/pipelines/eafbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import os
import shutil
from pathlib import Path

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
Expand All @@ -22,7 +23,6 @@ def create_eaf(
eaf_type: str,
contxt_on: int,
contxt_off: int,
template: str,
speech_segments: pd.DataFrame = None,
imported_set: str = None,
imported_format: str = None,
Expand Down Expand Up @@ -69,7 +69,7 @@ def create_eaf(
speaker_id = segment["speaker_id"]
elif "speaker_type" in segment:
speaker_id = segment["speaker_type"]
if pd.isnull(speaker_id) and imported_format in FORMAT_SPEECH : speaker_id = "SPEECH" #replace nan with SPEECH for some formats
if pd.isnull(speaker_id) and imported_format in FORMAT_SPEECH: speaker_id = "SPEECH" #replace nan with SPEECH for some formats

if speaker_id is None:
continue
Expand Down Expand Up @@ -183,7 +183,8 @@ def run(
imported_set = import_speech_from

for recording_filename, segs in segments.groupby("recording_filename"):
recording_prefix = os.path.splitext(recording_filename)[0]
full_recording = Path(recording_filename)
recording_prefix = full_recording.stem
output_filename = (
recording_prefix + "_" + eaf_type + "_" + os.path.basename(template)
)
Expand Down Expand Up @@ -217,7 +218,7 @@ def run(
imported_format = None


output_dir = os.path.join(destination, recording_prefix)
output_dir = os.path.join(destination, full_recording.parent)

create_eaf(
etf_path,
Expand All @@ -228,7 +229,6 @@ def run(
eaf_type,
context_onset,
context_offset,
template,
speech_segments,
imported_set,
imported_format,
Expand Down
105 changes: 91 additions & 14 deletions tests/test_eaf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,103 @@
import shutil
import pytest


from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.pipelines.samplers import PeriodicSampler
from ChildProject.pipelines.eafbuilder import EafBuilderPipeline
from ChildProject.pipelines.eafbuilder import EafBuilderPipeline, create_eaf

# Annotation set name; presumably the set speech is imported from (the tests
# visible here pass the literal 'vtc' rather than this constant) — TODO confirm.
IMP_FROM = 'vtc'
# Scratch directory that the `project` fixture populates with a copy of the
# example dataset and that the tests read from / write to.
PATH = os.path.join('output', 'eaf')

def fake_vocs(data, filename):
    """Stand-in import function: hand back ``data`` untouched, ignoring ``filename``."""
    return data

@pytest.fixture(scope="function")
def project(request):
    """Yield a ChildProject read from a fresh copy of the example dataset.

    The scratch directory ``PATH`` is removed if it already exists and then
    re-created from ``examples/valid_raw_data``, so each test starts from the
    same on-disk state regardless of what earlier tests wrote there.
    """
    # Wipe first: copytree refuses to write into an existing directory, and a
    # stale copy could carry files produced by a previous test.
    if os.path.exists(PATH):
        shutil.rmtree(PATH)
    shutil.copytree(src="examples/valid_raw_data", dst=PATH)

    dataset = ChildProject(PATH)
    dataset.read()

    yield dataset


# One-row table of speech segments (onset 10, offset 15, speaker type FEM)
# passed to create_eaf as its `speech_segments` argument in the tests below.
IMP = pd.DataFrame({'segment_onset': [10], 'segment_offset': [15], 'speaker_type': ['FEM']})
# Path to the 'basic' ETF template, passed to create_eaf as `etf_path`.
TEMP = os.path.join('ChildProject', 'templates', 'basic.etf')
@pytest.mark.parametrize(
    "etf_path,output_dir,recording_filename,timestamps_list,eaf_type,context_on,context_off,"
    "speech_segments,imported_set,error",
    [
        # etf_path is an int instead of a path
        (5, PATH, 'sound.wav', [], 'periodic', 0, 0, IMP, 'vtc', FileNotFoundError),
        # etf_path points at README.md (presumably not a usable template)
        ('README.md', PATH, 'sound.wav', [], 'periodic', 0, 0, IMP, 'vtc', Exception),
        # output_dir is an int
        (TEMP, 6, 'sound.wav', [], 'periodic', 0, 0, IMP, 'vtc', TypeError),
        # recording_filename is an int
        (TEMP, PATH, 8, [], 'periodic', 0, 0, IMP, 'vtc', TypeError),
        # timestamps_list is an int rather than a list
        (TEMP, PATH, 'sound.wav', 5, 'periodic', 0, 0, IMP, 'vtc', TypeError),
        # a timestamp pair whose second element is a string
        (TEMP, PATH, 'sound.wav', [(5, 'abc')], 'periodic', 0, 0, IMP, 'vtc', ValueError),
        # context_on is a string
        (TEMP, PATH, 'sound.wav', [(5, 10)], 'periodic', 'xp', 0, IMP, 'vtc', TypeError),
        # speech_segments is a string instead of a DataFrame
        (TEMP, PATH, 'sound.wav', [(5, 10)], 'periodic', 0, 0, 'x', 'vtc', AttributeError),
        # speech_segments lacks the segment_offset column
        (TEMP, PATH, 'sound.wav', [(5, 10)], 'periodic', 0, 0,
         IMP.drop(columns=['segment_offset']), 'vtc', KeyError),
        # imported_set is an int
        (TEMP, PATH, 'sound.wav', [(5, 10)], 'periodic', 0, 0, IMP, 5, AttributeError),
    ],
)
def test_create_eaf_inputs(
    project,
    etf_path,
    output_dir,
    recording_filename,
    timestamps_list,
    eaf_type,
    context_on,
    context_off,
    speech_segments,
    imported_set,
    error,
):
    """Check that create_eaf raises the expected exception for each malformed argument set."""
    with pytest.raises(error):
        create_eaf(
            etf_path,
            'sound',
            output_dir,
            recording_filename,
            timestamps_list,
            eaf_type,
            context_on,
            context_off,
            speech_segments,
            imported_set,
            'vtc_rttm',
        )

def _sorted_bounds(frame):
    """Return the onset/offset columns of ``frame``, sorted, with a fresh 0..n-1 index."""
    bounds = ['segment_onset', 'segment_offset']
    return frame[bounds].sort_values(bounds).reset_index(drop=True)


def _tier_segments(eaf, tier_name):
    """Collect the aligned annotations of ``tier_name`` as a sorted onset/offset DataFrame."""
    aligned = eaf.tiers[tier_name][0]
    rows = []
    for annotation_id in aligned:
        start_ts, end_ts, _value, _svg_ref = aligned[annotation_id]
        # Annotations reference timeslot ids; resolve them to their time values.
        rows.append({
            'segment_onset': int(eaf.timeslots[start_ts]),
            'segment_offset': int(eaf.timeslots[end_ts]),
        })
    return _sorted_bounds(pd.DataFrame(rows))


def test_create_eaf(project):
    """Build an EAF with create_eaf and verify its content once read back.

    Checks that the 'code_periodic' tier matches the requested timestamps,
    that the 'VTC-FEM' tier matches the segments in ``IMP``, and that the
    first media descriptor points at 'sound.wav'.
    """
    timestamps_list = [(10, 20), (30, 40), (50, 60)]
    # Separate components instead of an embedded '/' keep the path portable.
    output_dir = os.path.join(PATH, 'extra', 'eaf')

    create_eaf(TEMP, 'sound', output_dir, 'sound.wav', timestamps_list, 'periodic', 10, 10,
               IMP, 'vtc', 'vtc_rttm')

    eaf = Eaf(os.path.join(output_dir, 'sound.eaf'))

    expected_code = pd.DataFrame(timestamps_list, columns=['segment_onset', 'segment_offset'])
    pd.testing.assert_frame_equal(
        _tier_segments(eaf, 'code_periodic'),
        _sorted_bounds(expected_code),
        check_dtype=False,
    )

    pd.testing.assert_frame_equal(
        _tier_segments(eaf, 'VTC-FEM'),
        _sorted_bounds(IMP),
    )

    assert eaf.media_descriptors[0]['MEDIA_URL'] == 'sound.wav'


# @pytest.mark.parametrize("segments,type,template,context_onset,context_offset,path,import_speech_from",
# [])
def test_periodic(project):
"""
os.makedirs('output/eaf', exist_ok = True)
Expand Down Expand Up @@ -55,9 +132,9 @@ def test_periodic(project):
import_function=partial(fake_vocs, data),
)

sampler = PeriodicSampler(project, 500, 500, 250, recordings = ['sound.wav'])
sampler = PeriodicSampler(project, 500, 500, 250, recordings=['sound.wav'])
sampler.sample()
sampler.segments.to_csv('output/eaf/segments.csv')
sampler.segments.to_csv(os.path.join(PATH, 'segments.csv'))

ranges = sampler.segments.rename(
columns={
Expand All @@ -71,17 +148,17 @@ def test_periodic(project):

eaf_builder = EafBuilderPipeline()
eaf_builder.run(
destination = 'output/eaf',
segments = 'output/eaf/segments.csv',
eaf_type = 'periodic',
template = 'basic',
context_onset = 250,
context_offset = 250,
path='output/eaf',
destination=os.path.join(PATH, 'extra', 'eaf'),
segments=os.path.join(PATH, 'segments.csv'),
eaf_type='periodic',
template='basic',
context_onset=250,
context_offset=250,
path=PATH,
import_speech_from='vtc',
)

eaf = Eaf('output/eaf/sound/sound_periodic_basic.eaf')
eaf = Eaf(os.path.join(PATH, 'extra/eaf/sound_periodic_basic.eaf'))

code = eaf.tiers['code_periodic'][0]
segments = []
Expand Down

0 comments on commit 67525c1

Please sign in to comment.