Merge branch 'master' into conversations/summary

LAAC-LSCP · Aug 1, 2024 · 7e02ede · 7e02ede
2 parents e69ae02 + b63e15a
commit 7e02ede
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 8 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -20,7 +20,6 @@ jobs:
         with:
           ffmpeg-version: release
           architecture: ''
-          linking-type: static
           github-token: ${{ secrets.GITHUB }}
       - name: sndfile
         if: matrix.os == 'ubuntu-latest'
@@ -56,7 +55,6 @@ jobs:
         with:
           ffmpeg-version: release
           architecture: ''
-          linking-type: static
           github-token: ${{ secrets.GITHUB }}
       - name: Setup package
         run: |
@@ -86,7 +84,6 @@ jobs:
         with:
           ffmpeg-version: release
           architecture: ''
-          linking-type: static
           github-token: ${{ secrets.GITHUB }}
       - name: Setup package
         run: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@ All notable changes to this project will be documented in this file.
 ### Added
 
 - conversations summary extraction pipeline
+- docs and tests for init command
+- docs and tests for automated-import command
 
 ## [0.2.2] 2024-06-26
 

diff --git a/ChildProject/annotations.py b/ChildProject/annotations.py
@@ -632,10 +632,11 @@ def import_annotations(
         assert (input_processed["range_onset"] >= 0).all(), "range_onset must be greater or equal to 0"
         if "duration" in self.project.recordings.columns:
             assert (input_processed["range_offset"] <= input_processed.merge(self.project.recordings,
-                                                                            how='left',
-                                                                            on='recording_filename',
-                                                                            validate='m:1'
-                                                                            ).reset_index()["duration"]
+                                                                             how='left',
+                                                                             on='recording_filename',
+                                                                             validate='m:1',
+                                                                             suffixes=('_input', ''),
+                                                                             ).reset_index()["duration"]
             ).all(), "range_offset must be smaller than the duration of the recording"
 
         missing_recordings = input_processed[

diff --git a/docs/source/annotations.rst b/docs/source/annotations.rst
@@ -18,7 +18,7 @@ Importation
 Single annotation importation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Annotations can be imported one by one or in bulk. Annotation
+Annotations can be imported one by one, in bulk or through the automated command. Annotation
 importation does the following :
 
 1. Convert all input annotations from their original format (e.g. rttm,
@@ -62,6 +62,24 @@ Use this to do bulk importation of many annotation files.
 The input dataframe ``/path/to/dataframe.csv`` must have one entry per
 annotation to import, according to the format specified at :ref:`format-input-annotations`.
 
+Automated importation
+^^^^^^^^^^^^^^^^^^^^^
+
+The automated method is mostly used for automated annotations. It assumes a fixed set of parameters at import time, which lets the usual importations run without additional input. The command will assume the following:
+- the annotation files will cover the entirety of the audio they annotate (equivalent to range_onset 0 and range_offset <duration of rec>)
+- the annotation files will have timestamps that are not offset compared to the recording (equivalent to time_seek 0)
+- the annotation files will be named exactly like the recording they annotate (including the folder they are in) except for the extension, which depends on the format (equivalent to recording_filename = annotation_filename + extension)
+- the format used is the same for all the files and needs to be given in the call; it determines the extension for all the annotation files
+- the set to import is the same for all files and must be given in the call
+
+.. clidoc::
+
+   child-project automated-import . --help
+
+::
+
+   # import the vtc set by using the vtc_rttm format; all annotation files must have the ``.rttm`` extension
+   child-project automated-import . --set vtc --format vtc_rttm
 
 Rename a set of annotations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/docs/source/tools.rst b/docs/source/tools.rst
@@ -1,6 +1,22 @@
 Basic tools
 ===========
 
+Dataset initialization
+----------------------
+
+This command allows you to create a new, empty dataset with the correct structure.
+
+.. clidoc::
+
+   child-project init /path/to/dataset --help
+
+Example:
+
+.. code:: bash
+
+   # create a dataset in a folder named mydataset
+   child-project init mydataset
+
 .. _tools-data-validation:
 
 Data validation

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -41,6 +41,14 @@ def test_overview(project):
     assert exit_code == 0
 
 
+def test_init():
+    shutil.rmtree(PATH, ignore_errors=True)
+    stdout, stderr, exit_code = cli(
+        ["child-project", "init", PATH]
+    )
+    assert exit_code == 0
+
+
 def test_import_annotations(project):
     stdout, stderr, exit_code = cli(
         [
@@ -53,6 +61,22 @@ def test_import_annotations(project):
     )
     assert exit_code == 0
 
+
+def test_import_automated(project):
+    stdout, stderr, exit_code = cli(
+        [
+            "child-project",
+            "automated-import",
+            PATH,
+            "--set",
+            "vtc_rttm",
+            "--format",
+            "vtc_rttm",
+        ]
+    )
+    assert exit_code == 0
+
+
 def test_compute_durations(project):
     stdout, stderr, exit_code = cli(
         [