From d4b92eefd492fc0608ccda2251f7c28743bb1688 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Mon, 8 Jul 2024 15:33:20 +0200 Subject: [PATCH 1/9] docs and test cli call for init command --- docs/source/tools.rst | 16 ++++++++++++++++ tests/test_cli.py | 9 +++++++++ 2 files changed, 25 insertions(+) diff --git a/docs/source/tools.rst b/docs/source/tools.rst index 2b50287cf..c661178d2 100644 --- a/docs/source/tools.rst +++ b/docs/source/tools.rst @@ -1,6 +1,22 @@ Basic tools =========== +Dataset initialization +---------------------- + +This command allows you to create a new and empty dataset, with the correct structure + +.. clidoc:: + + child-project init /path/to/dataset --help + +Example: + +.. code:: bash + + # create a dataset in a folder named mydataset + child-project init mydataset + .. _tools-data-validation: Data validation diff --git a/tests/test_cli.py b/tests/test_cli.py index f0c4ee021..23fce07f1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -41,6 +41,14 @@ def test_overview(project): assert exit_code == 0 +def test_init(): + shutil.rmtree(PATH, ignore_errors=True) + stdout, stderr, exit_code = cli( + ["child-project", "init", PATH] + ) + assert exit_code == 0 + + def test_import_annotations(project): stdout, stderr, exit_code = cli( [ @@ -53,6 +61,7 @@ def test_import_annotations(project): ) assert exit_code == 0 + def test_compute_durations(project): stdout, stderr, exit_code = cli( [ From 1fc2517d6ada386299162ccaf05a6b3a16b13cda Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Mon, 8 Jul 2024 15:39:12 +0200 Subject: [PATCH 2/9] changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 731777369..ae0c0cb4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. 
## [Unreleased] +### Added + +- docs and tests for init command + ## [0.2.2] 2024-06-26 ### Added From 25066bb937004dc7635b18fbdf23f0ebfeff41e5 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Mon, 8 Jul 2024 15:52:46 +0200 Subject: [PATCH 3/9] freeze pillow for matplotlib version --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 3f7fc804c..c9c8e3cc6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ numpy==1.26.4; python_version >= '3.10' pandas==1.3.5; python_version <= '3.10' pandas==2.2.1; python_version >= '3.11' panoptes-client==1.6.1 +pillow<10.4.0 # this is constrained as matplotlib versions used fail on pillow > 10.3.0 praat-parselmouth==0.4.3 pyannote.core==5.0.0 pydub==0.25.1 From 24cbecf0f1420cdb2739b89c350a05ce9d0d47a4 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Mon, 8 Jul 2024 17:08:44 +0200 Subject: [PATCH 4/9] tests for cli automated import --- tests/test_cli.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index f0c4ee021..562386648 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -53,6 +53,22 @@ def test_import_annotations(project): ) assert exit_code == 0 + +def test_import_automated(project): + stdout, stderr, exit_code = cli( + [ + "child-project", + "automated-import", + PATH, + "--set", + "vtc_rttm", + "--format", + "vtc_rttm", + ] + ) + assert exit_code == 0 + + def test_compute_durations(project): stdout, stderr, exit_code = cli( [ From 6d24a9e459c5a35d46c28e85eadd31ed2648a85c Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Mon, 8 Jul 2024 17:09:05 +0200 Subject: [PATCH 5/9] docs for automated import --- docs/source/annotations.rst | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/source/annotations.rst b/docs/source/annotations.rst index 25d5a6933..eb4307be4 100644 --- a/docs/source/annotations.rst +++ b/docs/source/annotations.rst @@ -18,7 
+18,7 @@ Importation Single annotation importation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Annotations can be imported one by one or in bulk. Annotation +Annotations can be imported one by one, in bulk or through the automated command. Annotation importation does the following : 1. Convert all input annotations from their original format (e.g. rttm, @@ -62,6 +62,24 @@ Use this to do bulk importation of many annotation files. The input dataframe ``/path/to/dataframe.csv`` must have one entry per annotation to import, according to the format specified at :ref:`format-input-annotations`. +Automated importation +^^^^^^^^^^^^^^^^^^^^^ + +The automated method is mostly used for automated annotations. It is made to assume a certain number of parameters on importation, which allows us to perform the usual importations we are doing without additional input. The command will assume the following: +- the annotation files will cover the entirety of the audio they annotate (equivalent to range_onset 0 and range_offset equal to the recording duration) +- the annotation files will have timestamps that are not offset compared to the recording (equivalent to time_seek 0) +- the annotation files will be named exactly like the recording they annotate (including the folder they are in) except for the extension, which depends on the format (equivalent to recording_filename = annotation_filename + extension) +- the format used is the same for all the files and needs to be given in the call, it determines the extension for all the annotation files +- the set to import is the same for all files, must be given in the call + +.. clidoc:: + + child-project automated-import . --help + +:: + + # import the vtc set by using the vtc_rttm format, all annotation files will need to be with extension ``.rttm`` + child-project automated-import .
--set vtc --format vtc_rttm Rename a set of annotations ~~~~~~~~~~~~~~~~~~~~~~~~~~~ From ee8b8c1c0609db9eb90e776b4a7f5da4d630d201 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Mon, 8 Jul 2024 17:12:08 +0200 Subject: [PATCH 6/9] freeze requirements for pillow for matplotlib --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 3f7fc804c..c9c8e3cc6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ numpy==1.26.4; python_version >= '3.10' pandas==1.3.5; python_version <= '3.10' pandas==2.2.1; python_version >= '3.11' panoptes-client==1.6.1 +pillow<10.4.0 # this is constrained as matplotlib versions used fail on pillow > 10.3.0 praat-parselmouth==0.4.3 pyannote.core==5.0.0 pydub==0.25.1 From f9f3958adccd7c2e53326ab34d13d6b498d6e55b Mon Sep 17 00:00:00 2001 From: Loann Peurey <100950340+LoannPeurey@users.noreply.github.com> Date: Tue, 9 Jul 2024 11:38:22 +0200 Subject: [PATCH 7/9] use default linking type in ffmpeg setup --- .github/workflows/tests.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 553b4b4f8..5eddb1905 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,7 +20,6 @@ jobs: with: ffmpeg-version: release architecture: '' - linking-type: static github-token: ${{ secrets.GITHUB }} - name: sndfile if: matrix.os == 'ubuntu-latest' @@ -56,7 +55,6 @@ jobs: with: ffmpeg-version: release architecture: '' - linking-type: static github-token: ${{ secrets.GITHUB }} - name: Setup package run: | @@ -86,7 +84,6 @@ jobs: with: ffmpeg-version: release architecture: '' - linking-type: static github-token: ${{ secrets.GITHUB }} - name: Setup package run: | From bf67b055a4ce8fe3fb914b92d902bce1164e1849 Mon Sep 17 00:00:00 2001 From: Loann Peurey <100950340+LoannPeurey@users.noreply.github.com> Date: Tue, 9 Jul 2024 12:09:58 +0200 Subject: [PATCH 8/9] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 
file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae0c0cb4f..5ff175d69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ All notable changes to this project will be documented in this file. ### Added - docs and tests for init command +- docs and tests for automated-import command ## [0.2.2] 2024-06-26 From b63e15af91d23f3f28262796a81cdb025a911a59 Mon Sep 17 00:00:00 2001 From: Loann Peurey Date: Tue, 9 Jul 2024 16:43:02 +0200 Subject: [PATCH 9/9] dont fail importation when input dataframe has duplicate columns with recordings datafarme e.g. --- ChildProject/annotations.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ChildProject/annotations.py b/ChildProject/annotations.py index 3d38a58f3..23a0f7245 100644 --- a/ChildProject/annotations.py +++ b/ChildProject/annotations.py @@ -632,10 +632,11 @@ def import_annotations( assert (input_processed["range_onset"] >= 0).all(), "range_onset must be greater or equal to 0" if "duration" in self.project.recordings.columns: assert (input_processed["range_offset"] <= input_processed.merge(self.project.recordings, - how='left', - on='recording_filename', - validate='m:1' - ).reset_index()["duration"] + how='left', + on='recording_filename', + validate='m:1', + suffixes=('_input', ''), + ).reset_index()["duration"] ).all(), "range_offset must be smaller than the duration of the recording" missing_recordings = input_processed[