OpenSMILE Vocalic Features from Audio Files (#556)
* Processing vocalics done

* Linting code

* Saving all vocalics

* Handling error in vocalics extraction

* Fixing log issues

* Optimize insertion

* Adding chunk size

* Adding more messages

* Using the COPY method

* Adding ID as PK

* Fixing PK

* Adding commit

* Updating metadata

---------

Co-authored-by: Paulo Soares <[email protected]>
Paulo Soares authored Dec 19, 2023
1 parent 76663cf commit 06c94ed
Showing 44 changed files with 2,090 additions and 307 deletions.
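Most of the 44 changed files are not expanded below. The commit messages above hint at how the extracted vocalics are persisted: chunked inserts, a dedicated ID primary key, and the COPY method, which suggests PostgreSQL-style bulk loading. The repository's actual schema and insertion code are not part of this excerpt; a standalone, hypothetical sketch of that pattern with psycopg2 (table name, columns, and chunk size are all assumptions) might look like this:

import io

import psycopg2  # assumption: PostgreSQL, since the commit messages mention "the COPY method"

# Hypothetical table and columns; the commit's actual schema is not shown in this excerpt.
COPY_SQL = "COPY vocalics (id, group_session, station, frame_time, f0, loudness) FROM STDIN WITH CSV"
CHUNK_SIZE = 10_000  # illustrative chunk size


def copy_vocalics(conn: "psycopg2.extensions.connection", rows: list) -> None:
    """Bulk-load rows with COPY in chunks instead of issuing row-by-row INSERTs."""
    with conn.cursor() as cursor:
        for start in range(0, len(rows), CHUNK_SIZE):
            buffer = io.StringIO()
            for row in rows[start : start + CHUNK_SIZE]:
                buffer.write(",".join(str(value) for value in row) + "\n")
            buffer.seek(0)
            cursor.copy_expert(COPY_SQL, buffer)
    conn.commit()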
2 changes: 2 additions & 0 deletions human_experiments/data_pre_processing/.flake8
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 99
1 change: 1 addition & 0 deletions human_experiments/data_pre_processing/.gitignore
@@ -11,3 +11,4 @@
inspect-data.json

cache/
.tmp/
8 changes: 8 additions & 0 deletions human_experiments/data_pre_processing/Makefile
@@ -0,0 +1,8 @@
lint:
	black .
	isort .
	autoflake -r . --in-place --remove-all-unused-imports
	flake8 .

vocalics:
	./bin/extract_vocalic_features.py
28 changes: 20 additions & 8 deletions human_experiments/data_pre_processing/audio/entity/pcm_audio.py
@@ -10,7 +10,6 @@


class PCMAudio:

def __init__(self, filepath: str):
self.filepath = filepath

@@ -22,13 +21,17 @@ def fix_header(self, out_filepath: str):
"""

file_size = os.path.getsize(self.filepath)
chunksize = file_size - 8 # file size in bytes - 8 bytes of header (ChunkID and ChunkSize)
chunksize = (
file_size - 8
) # file size in bytes - 8 bytes of header (ChunkID and ChunkSize)
chunksize = chunksize.to_bytes(4, "little")

subchunk2size = file_size - 44 # file size in bytes - 44 bytes of header
subchunk2size_in_bytes = subchunk2size.to_bytes(4, "little")

with open(self.filepath, 'rb') as input_file, open(out_filepath, 'wb') as output_file:
with open(self.filepath, "rb") as input_file, open(
out_filepath, "wb"
) as output_file:
input_data = input_file.read()
output_file.write(input_data)

@@ -50,10 +53,19 @@ def extract_vocalic_features(self, out_filepath: str):

logs = logging.getLoggerClass().root.handlers[0].baseFilename
with open(logs, "a") as log_file:
if subprocess.call(command, shell=True, stdout=log_file, stderr=subprocess.STDOUT) != 0:
logging.error(f"Error extracting vocalic features from {self.filepath}.")

def transcribe_annotated_utterances(self, transcriber: Transcriber, annotation: PraatAnnotation):
if (
subprocess.call(
command, shell=True, stdout=log_file, stderr=subprocess.STDOUT
)
!= 0
):
logging.error(
f"Error extracting vocalic features from {self.filepath}."
)

def transcribe_annotated_utterances(
self, transcriber: Transcriber, annotation: PraatAnnotation
):
annotation.reset_transcript_tier()

full_audio = AudioSegment.from_wav(self.filepath)
@@ -66,7 +78,7 @@ def transcribe_annotated_utterances(self, transcriber: Transcriber, annotation:

# Remove double quotes so they do not break the annotation, strip surrounding whitespace, and
# capitalize the first letter.
text = result["text"].replace('"', '').strip()
text = result["text"].replace('"', "").strip()
if len(text) > 0:
text = text[0].upper() + text[1:]
annotation.set_transcript(index, text)
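For context on the fix_header hunk above: the arithmetic in the diff matches the canonical 44-byte PCM WAV header, where ChunkSize (file size minus 8) sits at byte offset 4 and Subchunk2Size (file size minus 44) sits at byte offset 40. The rest of the method is collapsed out of this diff; a minimal standalone sketch of the same repair (the function name and structure are illustrative, not the repository's code) could be:

import os
import shutil


def fix_wav_header(in_path: str, out_path: str) -> None:
    """Copy a PCM WAV file and rewrite its ChunkSize and Subchunk2Size header fields."""
    # Standalone sketch; PCMAudio.fix_header uses the same arithmetic but its full body is collapsed above.
    shutil.copyfile(in_path, out_path)
    file_size = os.path.getsize(out_path)

    chunksize = (file_size - 8).to_bytes(4, "little")  # everything after ChunkID and ChunkSize
    subchunk2size = (file_size - 44).to_bytes(4, "little")  # audio payload after the 44-byte header

    with open(out_path, "r+b") as wav_file:
        wav_file.seek(4)  # ChunkSize field starts at byte 4
        wav_file.write(chunksize)
        wav_file.seek(40)  # Subchunk2Size field starts at byte 40
        wav_file.write(subchunk2size)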
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from textgrids import TextGrid, Tier

from typing import Tuple
from copy import deepcopy
import os
from copy import deepcopy
from typing import List, Tuple

from typing import List
from textgrids import TextGrid, Tier


class PraatAnnotation:

def __init__(self, filepath: str):
self._grid = TextGrid(filepath)

@property
def sound_intervals(self) -> Tuple[float, float]:
tier_name = "standardized_silences" if "standardized_silences" in self._grid else "silences"
tier_name = (
"standardized_silences"
if "standardized_silences" in self._grid
else "silences"
)
for index, sound_period in enumerate(self._grid[tier_name]):
if sound_period.text == "s":
yield index, sound_period.xmin, sound_period.xmax
@@ -26,7 +27,11 @@ def transcripts(self) -> Tuple[float, float]:
yield index, sound_period.text

def reset_transcript_tier(self):
tier_name = "standardized_silences" if "standardized_silences" in self._grid else "silences"
tier_name = (
"standardized_silences"
if "standardized_silences" in self._grid
else "silences"
)
self._grid["transcripts"] = deepcopy(self._grid[tier_name])
for i in range(len(self._grid["transcripts"])):
self._grid["transcripts"][i].text = ""
@@ -40,15 +45,19 @@ def set_transcript(self, index: int, text: str):
self._grid["transcripts"][index].text = text

def set_labels(self, index: int, labels: List[str]):
self._grid["dialog_labels"][index].text = ",".join(map(lambda x: x.strip(), labels))
self._grid["dialog_labels"][index].text = ",".join(
map(lambda x: x.strip(), labels)
)

def save(self, out_filepath: str):
filename, file_extension = os.path.splitext(out_filepath)
self._grid.write(filename + ".TextGrid")

def standardize_silences(self, silence_threshold: float):
merged_silences = self._merge_silences(ref_tier=self._grid["silences"])
merged_sounds = self._merge_sounds(ref_tier=merged_silences, silence_threshold=silence_threshold)
merged_sounds = self._merge_sounds(
ref_tier=merged_silences, silence_threshold=silence_threshold
)
self._grid["standardized_silences"] = merged_sounds

@staticmethod
@@ -97,7 +106,10 @@ def _merge_sounds(ref_tier: Tier, silence_threshold: float):
intervals_to_merge.append(tier[j])
j += 1

if j == num_intervals or tier[j].xmin - interval.xmax > silence_threshold:
if (
j == num_intervals
or tier[j].xmin - interval.xmax > silence_threshold
):
# No merge necessary
merged_sounds.extend(intervals_to_merge)
else:
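Taken together, the PraatAnnotation methods touched in this file sketch the annotation workflow: standardize_silences merges short gaps between sound intervals, sound_intervals yields the resulting speech intervals, and set_transcript/set_labels fill the derived tiers. A hypothetical usage sketch (the paths and the 0.5-second threshold are made up for illustration):

from audio.entity.praat_annotation import PraatAnnotation

# Illustrative path and threshold only.
annotation = PraatAnnotation("exp_2023_01_01_10/lion/audio/block_2.TextGrid")

# Merge sound intervals separated by less than half a second of silence.
annotation.standardize_silences(silence_threshold=0.5)

# Create an empty "transcripts" tier aligned with the (standardized) silences tier.
annotation.reset_transcript_tier()

# Walk the speech intervals and attach placeholder transcripts.
for index, xmin, xmax in annotation.sound_intervals:
    annotation.set_transcript(index, f"utterance from {xmin:.2f}s to {xmax:.2f}s")

annotation.save("block_2_transcribed")  # save() appends the .TextGrid extension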
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
from copy import deepcopy
from typing import Any

from audio.entity.praat_annotation import PraatAnnotation
from tqdm import tqdm
import json
from logging import error
from typing import Any

import requests
from tqdm import tqdm

from audio.entity.praat_annotation import PraatAnnotation

class SentenceLabeler:

class SentenceLabeler:
def annotate_labels(self, transcripts_annotation: PraatAnnotation) -> Any:
raise NotImplementedError


class ToMCATDialogAgent(SentenceLabeler):

def __init__(self, host: str = "localhost", port: int = 8080):
self._api_url = f"http://{host}:{port}"
response = requests.post(self._api_url, data={"message": "status"})
@@ -41,6 +38,8 @@ def _get_labels(self, sentence: str):
for res in results:
labels.extend(res["labels"])
else:
error(f"Server Error! Request status code {response.status_code}. Sentence: {sentence}.")
error(
f"Server Error! Request status code {response.status_code}. Sentence: {sentence}."
)

return labels
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
import whisper
from pydub.audio_segment import AudioSegment
from typing import Any

import numpy as np
import torch

from typing import Any
import whisper
from pydub.audio_segment import AudioSegment


class Transcriber:

def transcribe(self, audio_segment: AudioSegment) -> Any:
raise NotImplementedError


class Whisper(Transcriber):

def __init__(self, model_name: str = "base.en"):
self._model = whisper.load_model(model_name, download_root="cache")

@@ -23,7 +21,9 @@ def transcribe(self, audio_segment: AudioSegment) -> Any:
# Normalize the samples the same way Whisper does when it loads audio from a file itself,
# so the model receives values in the [-1, 1] range.
# Ref: https://github.com/openai/whisper/blob/9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d/whisper/audio.py#L49
model_input = torch.from_numpy(np.array(audio_segment.get_array_of_samples(), float)) / 32768.0
model_input = (
torch.from_numpy(np.array(audio_segment.get_array_of_samples(), float))
/ 32768.0
)
model_input = model_input.to(torch.float32)
return self._model.transcribe(model_input, language="en")
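The division by 32768.0 above rescales signed 16-bit PCM samples into the [-1, 1] float range that Whisper produces when it loads audio from a file (2**15 = 32768). A hypothetical end-to-end use of the same idea, slicing one utterance out of a longer recording with pydub and transcribing it directly (paths, times, and the assumption of 16 kHz mono 16-bit audio are illustrative):

import numpy as np
import torch
import whisper
from pydub import AudioSegment

# Illustrative input; the scaling below assumes 16-bit samples, and Whisper expects 16 kHz mono audio.
full_audio = AudioSegment.from_wav("exp_2023_01_01_10/lion/audio/block_2.wav")
utterance = full_audio[12_500:15_800]  # pydub slices are in milliseconds

model = whisper.load_model("base.en", download_root="cache")
samples = np.array(utterance.get_array_of_samples(), dtype=float)
model_input = torch.from_numpy(samples / 32768.0).to(torch.float32)

result = model.transcribe(model_input, language="en")
print(result["text"].replace('"', "").strip())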

Original file line number Diff line number Diff line change
@@ -2,21 +2,24 @@
import logging
import os
import sys
from logging import info, error
from logging import error, info
from typing import Callable

from tqdm import tqdm

from audio.entity.pcm_audio import PCMAudio
from common.config import EXP_DIR, OUT_DIR, LOG_DIR
from utils import cd, is_directory_with_unified_xdf_files
from common.config import EXP_DIR, LOG_DIR, OUT_DIR
from utils import is_directory_with_unified_xdf_files


def extract_vocalic_features(experiments_dir: str, out_dir: str, override: bool):
info("Processing directories...")

directories_to_process = [directory for directory in os.listdir(experiments_dir) if
os.path.basename(directory)[:4] == "exp_"]
directories_to_process = [
directory
for directory in os.listdir(experiments_dir)
if os.path.basename(directory)[:4] == "exp_"
]

for group_session in tqdm(sorted(directories_to_process), unit="directories"):
info(f"Processing group session {group_session}")
@@ -30,33 +33,42 @@ def extract_vocalic_features(experiments_dir: str, out_dir: str, override: bool)


def process_directory_v1(experiment_dir: str, out_dir: str, override: bool):
return process_directory(experiment_dir, out_dir, lambda g, s: f"{g}/{s}/audio", override)
return process_directory(
experiment_dir, out_dir, lambda g, s: f"{g}/{s}/audio", override
)


def process_directory_v2(experiment_dir: str, out_dir: str, override: bool):
return process_directory(experiment_dir, out_dir, lambda g, s: f"{g}/{s}/audio/block_2", override)
return process_directory(
experiment_dir, out_dir, lambda g, s: f"{g}/{s}/audio/block_2", override
)


def process_directory(experiment_dir: str, out_dir: str, audio_dir_fn: Callable, override: bool):
def process_directory(
experiment_dir: str, out_dir: str, audio_dir_fn: Callable, override: bool
):
for station in ["lion", "tiger", "leopard"]:
audio_dir = audio_dir_fn(experiment_dir, station)
if not os.path.exists(audio_dir):
error(
f"Audio folder does not exist for station {station} in group session {os.path.basename(experiment_dir)}.")
f"Audio folder does not exist for station {station} in group session {os.path.basename(experiment_dir)}."
)
continue

for audio_file in os.listdir(audio_dir):
if audio_file[audio_file.rfind("."):].lower() != ".wav":
if audio_file[audio_file.rfind(".") :].lower() != ".wav":
continue

sub_dir = audio_dir[audio_dir.find("exp_"):] + "/vocalics"
sub_dir = audio_dir[audio_dir.find("exp_") :] + "/vocalics"
os.makedirs(f"{out_dir}/{sub_dir}", exist_ok=True)

vocalic_filename = audio_file[:audio_file.rfind(".")] + ".csv"
vocalic_filename = audio_file[: audio_file.rfind(".")] + ".csv"
vocalic_filepath = f"{out_dir}/{sub_dir}/{vocalic_filename}"

if os.path.exists(vocalic_filepath) and not override:
info(f"Skipping file {audio_file}. Vocalics already found in {out_dir}.")
info(
f"Skipping file {audio_file}. Vocalics already found in {out_dir}."
)
continue

audio = PCMAudio(filepath=f"{audio_dir}/{audio_file}")
@@ -68,16 +80,35 @@ def process_directory(experiment_dir: str, out_dir: str, audio_dir_fn: Callable,
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parses a collection of experiments, finds the audio files, extracts vocalic features from the audio "
" and save them to a .csv file."
" and save them to a .csv file."
)

parser.add_argument("--experiments_dir", type=str, required=False, default=EXP_DIR,
help="Directory containing experiment folders.")
parser.add_argument("--out_dir", type=str, required=False, default=OUT_DIR,
help="Directory where experiment folder structure containing vocalic features files must be saved.")
parser.add_argument("--log_dir", type=str, required=False, default=LOG_DIR,
help="Directory where log files must be saved.")
parser.add_argument("--override", action='store_true', help="Do not reprocess data already processed.")
parser.add_argument(
"--experiments_dir",
type=str,
required=False,
default=EXP_DIR,
help="Directory containing experiment folders.",
)
parser.add_argument(
"--out_dir",
type=str,
required=False,
default=OUT_DIR,
help="Directory where experiment folder structure containing vocalic features files must be saved.",
)
parser.add_argument(
"--log_dir",
type=str,
required=False,
default=LOG_DIR,
help="Directory where log files must be saved.",
)
parser.add_argument(
"--override",
action="store_true",
help="Do not reprocess data already processed.",
)

args = parser.parse_args()

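The OpenSMILE command assembled inside PCMAudio.extract_vocalic_features is collapsed out of this diff; the script above only drives it per audio file and writes one .csv per recording. As a point of reference only, and not the repository's approach (which shells out to an OpenSMILE command via subprocess), the opensmile Python package exposes comparable feature sets; a standalone sketch (the feature set choice and paths are assumptions):

import opensmile  # assumption: the audEERING opensmile Python package

# Assumption: eGeMAPS low-level descriptors; the commit's actual OpenSMILE config is not shown here.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)
features = smile.process_file("exp_2023_01_01_10/lion/audio/block_2.wav")  # returns a pandas DataFrame
features.to_csv("block_2.csv")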