reorganize constants for folders
LoannPeurey committed Nov 23, 2023
1 parent 6daef51 commit d0b3185
Showing 2 changed files with 97 additions and 59 deletions.
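The reorganization is mostly mechanical: the module-level string constants in ChildProject/projects.py (RAW_RECORDINGS, CONVERTED_RECORDINGS, METADATA_FOLDER, ...) move into small namespacing classes (Folders, FilesMeta, Paths) built on pathlib, and call sites drop os.path.join in favour of the Path / operator. A rough before/after sketch of a typical call site, not part of the commit itself; the dataset path is made up and the attribute names follow the diff below:

    from pathlib import Path

    project_path = Path("/data/my_dataset")          # hypothetical dataset root

    # Before this commit: module-level string constant, joined with os.path.join
    #   os.path.join(project_path, ChildProject.projects.CONVERTED_RECORDINGS, profile)
    # After this commit: the constant lives on the Paths class and is a pathlib fragment
    #   project_path / ChildProject.projects.Paths.CONVERTED_RECORDINGS / profile

    converted = project_path / Path("recordings") / "converted" / "standard"
    print(converted)                                 # /data/my_dataset/recordings/converted/standard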
12 changes: 2 additions & 10 deletions ChildProject/pipelines/processors.py
@@ -35,11 +35,7 @@ def __init__(
self.input_profile = input_profile

if self.input_profile:
input_path = os.path.join(
self.project.path,
ChildProject.projects.CONVERTED_RECORDINGS,
self.input_profile,
)
input_path = self.project.path / ChildProject.projects.Paths.CONVERTED_RECORDINGS / self.input_profile

assert os.path.exists(
input_path
@@ -52,11 +48,7 @@ def __init_subclass__(cls, **kwargs):
pipelines[cls.SUBCOMMAND] = cls

def output_directory(self):
return os.path.join(
self.project.path,
ChildProject.projects.CONVERTED_RECORDINGS,
self.name,
)
return self.project.path / ChildProject.projects.Paths.CONVERTED_RECORDINGS / self.name

def read_metadata(self):
path = os.path.join(self.output_directory(), "recordings.csv")
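Note that the surrounding processors.py code keeps mixing styles: read_metadata still builds its csv path with os.path.join even though output_directory now returns a pathlib.Path, and the assert above calls os.path.exists on the new pathlib-built input_path. This works because the os.path helpers accept path-like objects. A small illustrative sketch, not part of the commit, with a made-up directory:

    import os
    from pathlib import Path

    # Hypothetical converted-profile directory under a made-up dataset root
    output_dir = Path("/data/my_dataset") / "recordings" / "converted" / "standard"

    os.path.exists(output_dir)                   # os.path helpers accept Path objects (path-like)
    os.path.join(output_dir, "recordings.csv")   # returns a plain str path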
144 changes: 95 additions & 49 deletions ChildProject/projects.py
@@ -6,6 +6,7 @@
import pandas as pd
import re
import subprocess
from pathlib import Path

from .tables import (
IndexTable,
@@ -16,16 +17,31 @@
)
from .utils import get_audio_duration, path_is_parent

RAW_RECORDINGS = os.path.normpath("recordings/raw")
CONVERTED_RECORDINGS = os.path.normpath("recordings/converted")
STANDARD_SAMPLE_RATE = 16000
STANDARD_PROFILE = 'standard' # profile that is expected to contain the standardized audios (16kHz). The existence and sampling rates of this profile are checked when <validating this profile> or <validating without profile and the raw recordings are not 16kHz>.

METADATA_FOLDER = 'metadata'
CHILDREN_CSV = 'children.csv'
RECORDINGS_CSV = 'recordings.csv'
class Folders:
RECORDINGS = Path("recordings")
ANNOTATIONS = Path("annotations")
METADATA = Path("metadata")
SCRIPTS = Path("scripts")
DOC = Path("doc")


class FilesMeta:
RECORDINGS = Folders.METADATA / 'recordings.csv'
CHILDREN = Folders.METADATA / 'children.csv'

PROJECT_FOLDERS = ["recordings", "annotations", "metadata", "doc", "scripts"]

class Paths:
RAW_RECORDINGS = Folders.RECORDINGS / 'raw'
# RAW_RECORDINGS = os.path.normpath("recordings/raw")
CONVERTED_RECORDINGS = Folders.RECORDINGS / 'converted'
# CONVERTED_RECORDINGS = os.path.normpath("recordings/converted")


STANDARD_SAMPLE_RATE = 16000
STANDARD_PROFILE = 'standard' # profile that is expected to contain the standardized audios (16kHz). The existence and
# sampling rates of this profile are checked when <validating this profile> or <validating without profile and the raw
# recordings are not 16kHz>.


class ChildProject:
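All of the new constants are pathlib.Path fragments relative to the dataset root, so they can be combined with the / operator and still expose the bare file name through Path.name (which later hunks rely on, e.g. FilesMeta.RECORDINGS.name). A short sketch of how the pieces compose, illustration only, mirroring the definitions above with a made-up root:

    from pathlib import Path

    # Minimal mirror of the classes introduced above, for illustration only
    class Folders:
        RECORDINGS = Path("recordings")
        METADATA = Path("metadata")

    class FilesMeta:
        RECORDINGS = Folders.METADATA / "recordings.csv"

    class Paths:
        CONVERTED_RECORDINGS = Folders.RECORDINGS / "converted"

    root = Path("/data/my_dataset")            # hypothetical dataset root
    print(root / FilesMeta.RECORDINGS)         # /data/my_dataset/metadata/recordings.csv
    print(FilesMeta.RECORDINGS.name)           # recordings.csv
    print(root / Paths.CONVERTED_RECORDINGS / "standard" / FilesMeta.RECORDINGS.name)
    # -> /data/my_dataset/recordings/converted/standard/recordings.csv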
@@ -43,7 +59,7 @@ class ChildProject:
Attributes:
:param path: path to the root of the dataset.
:type path: str
:type path: Path
:param recordings: pandas dataframe representation of this dataset metadata/recordings.csv
:type recordings: class:`pd.DataFrame`
:param children: pandas dataframe representation of this dataset metadata/children.csv
@@ -274,7 +290,7 @@ class ChildProject:
def __init__(
self, path: str, enforce_dtypes: bool = False, ignore_discarded: bool = True
):
self.path = path
self.path = Path(path)
self.experiment = None
self.enforce_dtypes = enforce_dtypes
self.ignore_discarded = ignore_discarded
@@ -297,7 +313,7 @@ def accumulate_metadata(
merge_column: str,
verbose=False,
) -> pd.DataFrame:
md_path = os.path.join(self.path, METADATA_FOLDER, table)
md_path = self.path / Folders.METADATA / table

if not os.path.exists(md_path):
return df
@@ -371,13 +387,13 @@ def read(self, verbose=False, accumulate=True):
"""
self.ct = IndexTable(
"children",
os.path.join(self.path, METADATA_FOLDER,CHILDREN_CSV),
self.path / FilesMeta.CHILDREN,
self.CHILDREN_COLUMNS,
enforce_dtypes=self.enforce_dtypes,
)
self.rt = IndexTable(
"recordings",
os.path.join(self.path, METADATA_FOLDER,RECORDINGS_CSV),
self.path / FilesMeta.RECORDINGS,
self.RECORDINGS_COLUMNS,
enforce_dtypes=self.enforce_dtypes,
)
@@ -428,10 +444,10 @@ def write_recordings(self, keep_discarded: bool = True, keep_original_columns: b
:rtype: pandas.DataFrame
"""
if self.recordings is None:
#logger to add (can not write recordings file as recordings is not initialized)
# logger to add (can not write recordings file as recordings is not initialized)
return None
#get the file as reference point
current_csv = pd.read_csv(os.path.join(self.path, METADATA_FOLDER,RECORDINGS_CSV))
# get the file as reference point
current_csv = pd.read_csv(self.path / FilesMeta.RECORDINGS)

if 'discard' in current_csv.columns and keep_discarded:
# put the discard column into a usable form
@@ -451,7 +467,7 @@ else:
else:
columns = self.recordings.columns

recs_to_write.sort_index().to_csv(os.path.join(self.path, METADATA_FOLDER, RECORDINGS_CSV),columns = columns,index=False)
recs_to_write.sort_index().to_csv(self.path / FilesMeta.RECORDINGS, columns=columns, index=False)
return recs_to_write

def validate(self, ignore_recordings: bool = False, profile: str = None, accumulate: bool = True) -> tuple:
@@ -506,16 +522,13 @@ def validate(self, ignore_recordings: bool = False, profile: str = None, accumul

try:
path = self.get_recording_path(raw_filename, profile)
except:
except Exception:
if profile:
profile_metadata = os.path.join(
self.path,
CONVERTED_RECORDINGS,
profile,
RECORDINGS_CSV,
)
profile_metadata = (self.path / Paths.CONVERTED_RECORDINGS /
profile / FilesMeta.RECORDINGS.name)
self.errors.append(
f"failed to recover the path for recording '{raw_filename}' and profile '{profile}'. Does the profile exist? Does {profile_metadata} exist?"
f"failed to recover the path for recording '{raw_filename}' and profile '{profile}'. "
f"Does the profile exist? Does {profile_metadata} exist?"
)
continue

@@ -529,29 +542,65 @@ def validate(self, ignore_recordings: bool = False, profile: str = None, accumul
std_info = mediainfo(std_path)
if 'sample_rate' not in std_info:
self.warnings.append(
f"Could not read the sample rate of converted version of recording '{raw_filename}' at '{std_path}'. {STANDARD_SAMPLE_RATE}Hz is expected for profile {STANDARD_PROFILE}")
f"Could not read the sample rate of converted version of recording "
f"'{raw_filename}' at '{std_path}'. {STANDARD_SAMPLE_RATE}Hz is "
f"expected for profile {STANDARD_PROFILE}")
elif int(std_info['sample_rate']) != STANDARD_SAMPLE_RATE:
self.warnings.append(f"converted version of recording '{raw_filename}' at '{std_path}' has unexpected sampling rate {std_info['sample_rate']}Hz when {STANDARD_SAMPLE_RATE}Hz is expected for profile {STANDARD_PROFILE}")
self.warnings.append(f"converted version of recording '{raw_filename}' at "
f"'{std_path}' has unexpected sampling rate "
f"{std_info['sample_rate']}Hz when "
f"{STANDARD_SAMPLE_RATE}Hz is expected for profile "
f"{STANDARD_PROFILE}")
else:
if 'sample_rate' in info:
self.warnings.append(
f"recording '{raw_filename}' at '{path}' has a non standard sampling rate of {info['sample_rate']}Hz and no standard conversion in profile {STANDARD_PROFILE} was found. Does the standard profile exist? Does {profile_metadata} exist? you can create the standard profile with 'child-project process {self.path} {STANDARD_PROFILE} basic --format=wav --sampling={STANDARD_SAMPLE_RATE} --codec=pcm_s16le --skip-existing'")
f"recording '{raw_filename}' at '{path}' has a non standard sampling"
f" rate of {info['sample_rate']}Hz and no standard conversion in "
f"profile {STANDARD_PROFILE} was found. Does the standard profile "
f"exist? Does {profile_metadata} exist? you can create the standard "
f"profile with 'child-project process {self.path} {STANDARD_PROFILE} "
f"basic --format=wav --sampling={STANDARD_SAMPLE_RATE} "
f"--codec=pcm_s16le --skip-existing'")
else:
self.warnings.append(
f"Could not read the sample rate of recording '{raw_filename}' at '{path}' and no standard conversion in profile {STANDARD_PROFILE} was found. Does the standard profile exist? Does {profile_metadata} exist? you can create the standard profile with 'child-project process {self.path} {STANDARD_PROFILE} basic --format=wav --sampling={STANDARD_SAMPLE_RATE} --codec=pcm_s16le --skip-existing'")
except:
profile_metadata = os.path.join(self.path,CONVERTED_RECORDINGS,STANDARD_PROFILE,RECORDINGS_CSV,)
f"Could not read the sample rate of recording '{raw_filename}' at "
f"'{path}' and no standard conversion in profile {STANDARD_PROFILE} "
f"was found. Does the standard profile exist? Does {profile_metadata} "
f"exist? you can create the standard profile with 'child-project "
f"process {self.path} {STANDARD_PROFILE} basic --format=wav --sampling"
f"={STANDARD_SAMPLE_RATE} --codec=pcm_s16le --skip-existing'")
except Exception:
profile_metadata = (self.path / Paths.CONVERTED_RECORDINGS / STANDARD_PROFILE /
FilesMeta.RECORDINGS.name)
if 'sample_rate' in info:
self.warnings.append(f"recording '{raw_filename}' at '{path}' has a non standard sampling rate of {info['sample_rate']}Hz and no standard conversion in profile {STANDARD_PROFILE} was found. Does the standard profile exist? Does {profile_metadata} exist? you can create the standard profile with 'child-project process {self.path} {STANDARD_PROFILE} basic --format=wav --sampling={STANDARD_SAMPLE_RATE} --codec=pcm_s16le --skip-existing'")
self.warnings.append(f"recording '{raw_filename}' at '{path}' has a non standar"
f"d sampling rate of {info['sample_rate']}Hz and no standa"
f"rd conversion in profile {STANDARD_PROFILE} was found. D"
f"oes the standard profile exist? Does {profile_metadata} "
f"exist? you can create the standard profile with 'child-p"
f"roject process {self.path} {STANDARD_PROFILE} basic --fo"
f"rmat=wav --sampling={STANDARD_SAMPLE_RATE} --codec=pcm_s"
f"16le --skip-existing'")
else:
self.warnings.append(f"Could not read the sample rate of recording '{raw_filename}' at '{path}' and no standard conversion in profile {STANDARD_PROFILE} was found. Does the standard profile exist? Does {profile_metadata} exist? you can create the standard profile with 'child-project process {self.path} {STANDARD_PROFILE} basic --format=wav --sampling={STANDARD_SAMPLE_RATE} --codec=pcm_s16le --skip-existing'")
self.warnings.append(f"Could not read the sample rate of recording '"
f"{raw_filename}' at '{path}' and no standard conversion i"
f"n profile {STANDARD_PROFILE} was found. Does the standar"
f"d profile exist? Does {profile_metadata} exist? you can "
f"create the standard profile with 'child-project process "
f"{self.path} {STANDARD_PROFILE} basic --format=wav --samp"
f"ling={STANDARD_SAMPLE_RATE} --codec=pcm_s16le --skip-exi"
f"sting'")
elif profile == STANDARD_PROFILE:
info = mediainfo(path)
if 'sample_rate' in info and int(info['sample_rate']) != STANDARD_SAMPLE_RATE:
self.warnings.append(f"recording '{raw_filename}' at '{path}' has unexpected sampling rate {info['sample_rate']}Hz when {STANDARD_SAMPLE_RATE}Hz is expected for profile {STANDARD_PROFILE}")
self.warnings.append(f"recording '{raw_filename}' at '{path}' has unexpected sampling r"
f"ate {info['sample_rate']}Hz when {STANDARD_SAMPLE_RATE}Hz is exp"
f"ected for profile {STANDARD_PROFILE}")

elif os.path.islink(path):
message = self.warnings.append(f"the data content of recording '{raw_filename}' at path '{path}' is absent. See 'broken symlinks'") #The path is valid but there's no content. See broken symlinks (try 'Datalad get $filename')
message = self.warnings.append(f"the data content of recording '{raw_filename}' at path '{path}"
f"' is absent. See 'broken symlinks'")
# The path is valid but there's no content. See broken symlinks (try 'Datalad get $filename')
else:
message = f"cannot find recording '{raw_filename}' at '{path}'"
if column_attr.required:
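For reference, the warnings in the hunk above hinge on mediainfo returning a dict that may lack a 'sample_rate' key entirely. A stripped-down sketch of that check, not part of the commit, assuming mediainfo is pydub.utils.mediainfo (the import is outside this diff) and ffprobe is available:

    from pydub.utils import mediainfo   # assumed source of mediainfo(); the import is not shown in this diff

    STANDARD_SAMPLE_RATE = 16000

    def sample_rate_warning(path):
        """Return a warning string, or None when the file matches the standard rate."""
        info = mediainfo(path)
        if 'sample_rate' not in info:
            return f"could not read the sample rate of '{path}'"
        if int(info['sample_rate']) != STANDARD_SAMPLE_RATE:
            return f"'{path}' is {info['sample_rate']}Hz, expected {STANDARD_SAMPLE_RATE}Hz"
        return None

    # sample_rate_warning("recordings/converted/standard/rec001.wav")  # hypothetical file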
@@ -578,8 +627,9 @@ def validate(self, ignore_recordings: bool = False, profile: str = None, accumul
)

self.errors += [
f"Age at recording is negative in recordings on line {index} ({age:.1f} months). Check date_iso for that recording and child_dob for the corresponding child."
for index, age in ages[ages<0].iteritems()
(f"Age at recording is negative in recordings on line {index} ({age:.1f} months). Check date_iso for "
f"that recording and child_dob for the corresponding child.")
for index, age in ages[ages < 0].iteritems()
]

# detect un-indexed recordings and throw warnings
@@ -590,12 +640,12 @@ def validate(self, ignore_recordings: bool = False, profile: str = None, accumul
]

indexed_files = [
os.path.abspath(os.path.join(self.path, RAW_RECORDINGS, str(f)))
os.path.abspath(self.path / Paths.RAW_RECORDINGS / str(f))
for f in pd.core.common.flatten(files)
]

recordings_files = glob.glob(
os.path.join(os.path.normcase(self.path), RAW_RECORDINGS, "**/*.*"), recursive=True
os.path.normcase(self.path / Paths.RAW_RECORDINGS / "**" / "*.*"), recursive=True
)

for rf in recordings_files:
@@ -631,11 +681,9 @@ def get_recording_path(self, recording_filename: str, profile: str = None) -> st
if converted_filename is None:
return None

return os.path.join(
os.path.normcase(self.path), CONVERTED_RECORDINGS, profile, os.path.normpath(converted_filename),
)
return os.path.normcase(self.path / Paths.CONVERTED_RECORDINGS / profile / converted_filename)
else:
return os.path.join(os.path.normcase(self.path), RAW_RECORDINGS, os.path.normpath(recording_filename))
return os.path.normcase(self.path / Paths.RAW_RECORDINGS / recording_filename)

def get_converted_recording_filename(
self, profile: str, recording_filename: str
@@ -657,9 +705,7 @@ def get_converted_recording_filename(
return self.converted_recordings_hashtable[key]

converted_recordings = pd.read_csv(
os.path.join(
self.path, CONVERTED_RECORDINGS, profile, RECORDINGS_CSV
)
self.path / Paths.CONVERTED_RECORDINGS / profile / FilesMeta.RECORDINGS.name
)
converted_recordings.dropna(subset=["converted_filename"], inplace=True)

@@ -683,9 +729,9 @@ def recording_from_path(self, path: str, profile: str = None) -> str:
raise NotImplementedError(
"cannot recover recording from the path to a converted media yet"
)
# media_path = os.path.join(self.path, CONVERTED_RECORDINGS, profile)
# media_path = self.path / Paths.CONVERTED_RECORDINGS / profile
else:
media_path = os.path.join(self.path, RAW_RECORDINGS)
media_path = self.path / Paths.RAW_RECORDINGS

if not path_is_parent(media_path, path):
return None
@@ -728,7 +774,7 @@ def get_recordings_from_list(
recs = pd.Series(recordings)
missing_recs = recs[~recs.isin(self.recordings['recording_filename'])].tolist()
#self.recordings[~self.recordings['recording_filename'].isin(recordings)]['recording_filename'].tolist()
raise ValueError("recordings {} were not found in the dataset index. Check the names and make sure they exist in '{}'".format(missing_recs,os.path.join(METADATA_FOLDER,RECORDINGS_CSV)))
raise ValueError("recordings {} were not found in the dataset index. Check the names and make sure they exist in '{}'".format(missing_recs,FilesMeta.RECORDINGS))


return _recordings
