OpenSMILE Vocalic Features from Audio Files (#556)
* Processing vocalics done

* Linting code

* Saving all vocalics

* Handling error in vocalics extraction

* Fixing log issues

* Optimize insertion

* Adding chunk size

* Adding more messages

* Using the COPY method

* Adding ID as PK

* Fixing PK

* Adding commit

* Updating metadata

---------

Co-authored-by: Paulo Soares <[email protected]>
Paulo Soares authored Dec 19, 2023
1 parent 76663cf commit 06c94ed
Showing 44 changed files with 2,090 additions and 307 deletions.
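Most of the 44 changed files are not expanded below. The commit messages above hint at how the extracted vocalics are persisted: chunked inserts, a dedicated ID primary key, and the COPY method, which suggests PostgreSQL-style bulk loading. The repository's actual schema and insertion code are not part of this excerpt; a standalone, hypothetical sketch of that pattern with psycopg2 (table name, columns, and chunk size are all assumptions) might look like this:

import io

import psycopg2  # assumption: PostgreSQL, since the commit messages mention "the COPY method"

# Hypothetical table and columns; the commit's actual schema is not shown in this excerpt.
COPY_SQL = "COPY vocalics (id, group_session, station, frame_time, f0, loudness) FROM STDIN WITH CSV"
CHUNK_SIZE = 10_000  # illustrative chunk size


def copy_vocalics(conn: "psycopg2.extensions.connection", rows: list) -> None:
    """Bulk-load rows with COPY in chunks instead of issuing row-by-row INSERTs."""
    with conn.cursor() as cursor:
        for start in range(0, len(rows), CHUNK_SIZE):
            buffer = io.StringIO()
            for row in rows[start : start + CHUNK_SIZE]:
                buffer.write(",".join(str(value) for value in row) + "\n")
            buffer.seek(0)
            cursor.copy_expert(COPY_SQL, buffer)
    conn.commit()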
2 changes: 2 additions & 0 deletions human_experiments/data_pre_processing/.flake8
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 99
1 change: 1 addition & 0 deletions human_experiments/data_pre_processing/.gitignore
@@ -11,3 +11,4 @@
inspect-data.json

cache/
.tmp/
8 changes: 8 additions & 0 deletions human_experiments/data_pre_processing/Makefile
@@ -0,0 +1,8 @@
lint:
	black .
	isort .
	autoflake -r . --in-place --remove-all-unused-imports
	flake8 .

vocalics:
	./bin/extract_vocalic_features.py
28 changes: 20 additions & 8 deletions human_experiments/data_pre_processing/audio/entity/pcm_audio.py
@@ -10,7 +10,6 @@


class PCMAudio:

def __init__(self, filepath: str):
self.filepath = filepath

@@ -22,13 +21,17 @@ def fix_header(self, out_filepath: str):
"""

file_size = os.path.getsize(self.filepath)
chunksize = file_size - 8 # file size in bytes - 8 bytes of header (ChunkID and ChunkSize)
chunksize = (
file_size - 8
) # file size in bytes - 8 bytes of header (ChunkID and ChunkSize)
chunksize = chunksize.to_bytes(4, "little")

subchunk2size = file_size - 44 # file size in bytes - 44 bytes of header
subchunk2size_in_bytes = subchunk2size.to_bytes(4, "little")

with open(self.filepath, 'rb') as input_file, open(out_filepath, 'wb') as output_file:
with open(self.filepath, "rb") as input_file, open(
out_filepath, "wb"
) as output_file:
input_data = input_file.read()
output_file.write(input_data)

@@ -50,10 +53,19 @@ def extract_vocalic_features(self, out_filepath: str):

logs = logging.getLoggerClass().root.handlers[0].baseFilename
with open(logs, "a") as log_file:
if subprocess.call(command, shell=True, stdout=log_file, stderr=subprocess.STDOUT) != 0:
logging.error(f"Error extracting vocalic features from {self.filepath}.")

def transcribe_annotated_utterances(self, transcriber: Transcriber, annotation: PraatAnnotation):
if (
subprocess.call(
command, shell=True, stdout=log_file, stderr=subprocess.STDOUT
)
!= 0
):
logging.error(
f"Error extracting vocalic features from {self.filepath}."
)

def transcribe_annotated_utterances(
self, transcriber: Transcriber, annotation: PraatAnnotation
):
annotation.reset_transcript_tier()

full_audio = AudioSegment.from_wav(self.filepath)
@@ -66,7 +78,7 @@ def transcribe_annotated_utterances(self, transcriber: Transcriber, annotation:

# Remove double quotes so they do not break the annotation, strip surrounding whitespace, and
# capitalize the first letter.
text = result["text"].replace('"', '').strip()
text = result["text"].replace('"', "").strip()
if len(text) > 0:
text = text[0].upper() + text[1:]
annotation.set_transcript(index, text)
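For context on the fix_header hunk above: the arithmetic in the diff matches the canonical 44-byte PCM WAV header, where ChunkSize (file size minus 8) sits at byte offset 4 and Subchunk2Size (file size minus 44) sits at byte offset 40. The rest of the method is collapsed out of this diff; a minimal standalone sketch of the same repair (the function name and structure are illustrative, not the repository's code) could be:

import os
import shutil


def fix_wav_header(in_path: str, out_path: str) -> None:
    """Copy a PCM WAV file and rewrite its ChunkSize and Subchunk2Size header fields."""
    # Standalone sketch; PCMAudio.fix_header uses the same arithmetic but its full body is collapsed above.
    shutil.copyfile(in_path, out_path)
    file_size = os.path.getsize(out_path)

    chunksize = (file_size - 8).to_bytes(4, "little")  # everything after ChunkID and ChunkSize
    subchunk2size = (file_size - 44).to_bytes(4, "little")  # audio payload after the 44-byte header

    with open(out_path, "r+b") as wav_file:
        wav_file.seek(4)  # ChunkSize field starts at byte 4
        wav_file.write(chunksize)
        wav_file.seek(40)  # Subchunk2Size field starts at byte 40
        wav_file.write(subchunk2size)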
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
from textgrids import TextGrid, Tier

from typing import Tuple
from copy import deepcopy
import os
from copy import deepcopy
from typing import List, Tuple

from typing import List
from textgrids import TextGrid, Tier


class PraatAnnotation:

def __init__(self, filepath: str):
self._grid = TextGrid(filepath)

@property
def sound_intervals(self) -> Tuple[float, float]:
tier_name = "standardized_silences" if "standardized_silences" in self._grid else "silences"
tier_name = (
"standardized_silences"
if "standardized_silences" in self._grid
else "silences"
)
for index, sound_period in enumerate(self._grid[tier_name]):
if sound_period.text == "s":
yield index, sound_period.xmin, sound_period.xmax
@@ -26,7 +27,11 @@ def transcripts(self) -> Tuple[float, float]:
yield index, sound_period.text

def reset_transcript_tier(self):
tier_name = "standardized_silences" if "standardized_silences" in self._grid else "silences"
tier_name = (
"standardized_silences"
if "standardized_silences" in self._grid
else "silences"
)
self._grid["transcripts"] = deepcopy(self._grid[tier_name])
for i in range(len(self._grid["transcripts"])):
self._grid["transcripts"][i].text = ""
@@ -40,15 +45,19 @@ def set_transcript(self, index: int, text: str):
self._grid["transcripts"][index].text = text

def set_labels(self, index: int, labels: List[str]):
self._grid["dialog_labels"][index].text = ",".join(map(lambda x: x.strip(), labels))
self._grid["dialog_labels"][index].text = ",".join(
map(lambda x: x.strip(), labels)
)

def save(self, out_filepath: str):
filename, file_extension = os.path.splitext(out_filepath)
self._grid.write(filename + ".TextGrid")

def standardize_silences(self, silence_threshold: float):
merged_silences = self._merge_silences(ref_tier=self._grid["silences"])
merged_sounds = self._merge_sounds(ref_tier=merged_silences, silence_threshold=silence_threshold)
merged_sounds = self._merge_sounds(
ref_tier=merged_silences, silence_threshold=silence_threshold
)
self._grid["standardized_silences"] = merged_sounds

@staticmethod
@@ -97,7 +106,10 @@ def _merge_sounds(ref_tier: Tier, silence_threshold: float):
intervals_to_merge.append(tier[j])
j += 1

if j == num_intervals or tier[j].xmin - interval.xmax > silence_threshold:
if (
j == num_intervals
or tier[j].xmin - interval.xmax > silence_threshold
):
# No merge necessary
merged_sounds.extend(intervals_to_merge)
else:
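Taken together, the PraatAnnotation methods touched in this file sketch the annotation workflow: standardize_silences merges short gaps between sound intervals, sound_intervals yields the resulting speech intervals, and set_transcript/set_labels fill the derived tiers. A hypothetical usage sketch (the paths and the 0.5-second threshold are made up for illustration):

from audio.entity.praat_annotation import PraatAnnotation

# Illustrative path and threshold only.
annotation = PraatAnnotation("exp_2023_01_01_10/lion/audio/block_2.TextGrid")

# Merge sound intervals separated by less than half a second of silence.
annotation.standardize_silences(silence_threshold=0.5)

# Create an empty "transcripts" tier aligned with the (standardized) silences tier.
annotation.reset_transcript_tier()

# Walk the speech intervals and attach placeholder transcripts.
for index, xmin, xmax in annotation.sound_intervals:
    annotation.set_transcript(index, f"utterance from {xmin:.2f}s to {xmax:.2f}s")

annotation.save("block_2_transcribed")  # save() appends the .TextGrid extension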
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
from copy import deepcopy
from typing import Any

from audio.entity.praat_annotation import PraatAnnotation
from tqdm import tqdm
import json
from logging import error
from typing import Any

import requests
from tqdm import tqdm

from audio.entity.praat_annotation import PraatAnnotation

class SentenceLabeler:

class SentenceLabeler:
def annotate_labels(self, transcripts_annotation: PraatAnnotation) -> Any:
raise NotImplementedError


class ToMCATDialogAgent(SentenceLabeler):

def __init__(self, host: str = "localhost", port: int = 8080):
self._api_url = f"http://{host}:{port}"
response = requests.post(self._api_url, data={"message": "status"})
@@ -41,6 +38,8 @@ def _get_labels(self, sentence: str):
for res in results:
labels.extend(res["labels"])
else:
error(f"Server Error! Request status code {response.status_code}. Sentence: {sentence}.")
error(
f"Server Error! Request status code {response.status_code}. Sentence: {sentence}."
)

return labels
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
import whisper
from pydub.audio_segment import AudioSegment
from typing import Any

import numpy as np
import torch

from typing import Any
import whisper
from pydub.audio_segment import AudioSegment


class Transcriber:

def transcribe(self, audio_segment: AudioSegment) -> Any:
raise NotImplementedError


class Whisper(Transcriber):

def __init__(self, model_name: str = "base.en"):
self._model = whisper.load_model(model_name, download_root="cache")

@@ -23,7 +21,9 @@ def transcribe(self, audio_segment: AudioSegment) -> Any:
# Normalize the samples the same way Whisper does when it loads audio from a file itself,
# so the model receives values in the [-1, 1] range.
# Ref: https://github.com/openai/whisper/blob/9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d/whisper/audio.py#L49
model_input = torch.from_numpy(np.array(audio_segment.get_array_of_samples(), float)) / 32768.0
model_input = (
torch.from_numpy(np.array(audio_segment.get_array_of_samples(), float))
/ 32768.0
)
model_input = model_input.to(torch.float32)
return self._model.transcribe(model_input, language="en")
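The division by 32768.0 above rescales signed 16-bit PCM samples into the [-1, 1] float range that Whisper produces when it loads audio from a file (2**15 = 32768). A hypothetical end-to-end use of the same idea, slicing one utterance out of a longer recording with pydub and transcribing it directly (paths, times, and the assumption of 16 kHz mono 16-bit audio are illustrative):

import numpy as np
import torch
import whisper
from pydub import AudioSegment

# Illustrative input; the scaling below assumes 16-bit samples, and Whisper expects 16 kHz mono audio.
full_audio = AudioSegment.from_wav("exp_2023_01_01_10/lion/audio/block_2.wav")
utterance = full_audio[12_500:15_800]  # pydub slices are in milliseconds

model = whisper.load_model("base.en", download_root="cache")
samples = np.array(utterance.get_array_of_samples(), dtype=float)
model_input = torch.from_numpy(samples / 32768.0).to(torch.float32)

result = model.transcribe(model_input, language="en")
print(result["text"].replace('"', "").strip())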

Original file line number Diff line number Diff line change
@@ -2,21 +2,24 @@
import logging
import os
import sys
from logging import info, error
from logging import error, info
from typing import Callable

from tqdm import tqdm

from audio.entity.pcm_audio import PCMAudio
from common.config import EXP_DIR, OUT_DIR, LOG_DIR
from utils import cd, is_directory_with_unified_xdf_files
from common.config import EXP_DIR, LOG_DIR, OUT_DIR
from utils import is_directory_with_unified_xdf_files


def extract_vocalic_features(experiments_dir: str, out_dir: str, override: bool):
info("Processing directories...")

directories_to_process = [directory for directory in os.listdir(experiments_dir) if
os.path.basename(directory)[:4] == "exp_"]
directories_to_process = [
directory
for directory in os.listdir(experiments_dir)
if os.path.basename(directory)[:4] == "exp_"
]

for group_session in tqdm(sorted(directories_to_process), unit="directories"):
info(f"Processing group session {group_session}")
@@ -30,33 +33,42 @@ def extract_vocalic_features(experiments_dir: str, out_dir: str, override: bool)


def process_directory_v1(experiment_dir: str, out_dir: str, override: bool):
return process_directory(experiment_dir, out_dir, lambda g, s: f"{g}/{s}/audio", override)
return process_directory(
experiment_dir, out_dir, lambda g, s: f"{g}/{s}/audio", override
)


def process_directory_v2(experiment_dir: str, out_dir: str, override: bool):
return process_directory(experiment_dir, out_dir, lambda g, s: f"{g}/{s}/audio/block_2", override)
return process_directory(
experiment_dir, out_dir, lambda g, s: f"{g}/{s}/audio/block_2", override
)


def process_directory(experiment_dir: str, out_dir: str, audio_dir_fn: Callable, override: bool):
def process_directory(
experiment_dir: str, out_dir: str, audio_dir_fn: Callable, override: bool
):
for station in ["lion", "tiger", "leopard"]:
audio_dir = audio_dir_fn(experiment_dir, station)
if not os.path.exists(audio_dir):
error(
f"Audio folder does not exist for station {station} in group session {os.path.basename(experiment_dir)}.")
f"Audio folder does not exist for station {station} in group session {os.path.basename(experiment_dir)}."
)
continue

for audio_file in os.listdir(audio_dir):
if audio_file[audio_file.rfind("."):].lower() != ".wav":
if audio_file[audio_file.rfind(".") :].lower() != ".wav":
continue

sub_dir = audio_dir[audio_dir.find("exp_"):] + "/vocalics"
sub_dir = audio_dir[audio_dir.find("exp_") :] + "/vocalics"
os.makedirs(f"{out_dir}/{sub_dir}", exist_ok=True)

vocalic_filename = audio_file[:audio_file.rfind(".")] + ".csv"
vocalic_filename = audio_file[: audio_file.rfind(".")] + ".csv"
vocalic_filepath = f"{out_dir}/{sub_dir}/{vocalic_filename}"

if os.path.exists(vocalic_filepath) and not override:
info(f"Skipping file {audio_file}. Vocalics already found in {out_dir}.")
info(
f"Skipping file {audio_file}. Vocalics already found in {out_dir}."
)
continue

audio = PCMAudio(filepath=f"{audio_dir}/{audio_file}")
@@ -68,16 +80,35 @@ def process_directory(experiment_dir: str, out_dir: str, audio_dir_fn: Callable,
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parses a collection of experiments, finds the audio files, extracts vocalic features from the audio "
" and save them to a .csv file."
" and save them to a .csv file."
)

parser.add_argument("--experiments_dir", type=str, required=False, default=EXP_DIR,
help="Directory containing experiment folders.")
parser.add_argument("--out_dir", type=str, required=False, default=OUT_DIR,
help="Directory where experiment folder structure containing vocalic features files must be saved.")
parser.add_argument("--log_dir", type=str, required=False, default=LOG_DIR,
help="Directory where log files must be saved.")
parser.add_argument("--override", action='store_true', help="Do not reprocess data already processed.")
parser.add_argument(
"--experiments_dir",
type=str,
required=False,
default=EXP_DIR,
help="Directory containing experiment folders.",
)
parser.add_argument(
"--out_dir",
type=str,
required=False,
default=OUT_DIR,
help="Directory where experiment folder structure containing vocalic features files must be saved.",
)
parser.add_argument(
"--log_dir",
type=str,
required=False,
default=LOG_DIR,
help="Directory where log files must be saved.",
)
parser.add_argument(
"--override",
action="store_true",
help="Do not reprocess data already processed.",
)

args = parser.parse_args()

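The OpenSMILE command assembled inside PCMAudio.extract_vocalic_features is collapsed out of this diff; the script above only drives it per audio file and writes one .csv per recording. As a point of reference only, and not the repository's approach (which shells out to an OpenSMILE command via subprocess), the opensmile Python package exposes comparable feature sets; a standalone sketch (the feature set choice and paths are assumptions):

import opensmile  # assumption: the audEERING opensmile Python package

# Assumption: eGeMAPS low-level descriptors; the commit's actual OpenSMILE config is not shown here.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)
features = smile.process_file("exp_2023_01_01_10/lion/audio/block_2.wav")  # returns a pandas DataFrame
features.to_csv("block_2.csv")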