Merge pull request #445 from LAAC-LSCP/pipelines/standard-conversion
Pipelines/standard conversion
LoannPeurey authored Nov 13, 2023
2 parents 612adae + c59deb6 commit e7a1f94
Showing 2 changed files with 157 additions and 4 deletions.
130 changes: 126 additions & 4 deletions ChildProject/pipelines/processors.py
@@ -227,6 +227,8 @@ def process_recording(self, recording):
    @staticmethod
    def add_parser(subparsers, subcommand):
        parser = subparsers.add_parser(subcommand, help="basic audio conversion")
        parser.add_argument("name", help="name of the export profile")

        parser.add_argument("--format", help="audio format (e.g. wav)", required=True)
        parser.add_argument(
            "--codec", help="audio codec (e.g. pcm_s16le)", required=True
@@ -330,6 +332,8 @@ def process_recording(self, recording):
    @staticmethod
    def add_parser(subparsers, subcommand):
        parser = subparsers.add_parser(subcommand, help="vetting")
        parser.add_argument("name", help="name of the export profile")

        parser.add_argument(
            "--segments-path",
            help="path to the CSV dataframe containing the segments to be vetted",
@@ -415,6 +419,8 @@ def process_recording(self, recording):
    @staticmethod
    def add_parser(subparsers, subcommand):
        parser = subparsers.add_parser(subcommand, help="channel mapping")
        parser.add_argument("name", help="name of the export profile")

        parser.add_argument(
            "--channels",
            help="list of weights for each channel",
@@ -428,6 +434,124 @@ def add_parser(subparsers, subcommand):
            nargs="+",
        )

class AudioStandard(AudioProcessor):
    SUBCOMMAND = "standard"

    def __init__(
        self,
        project: ChildProject.projects.ChildProject,
        threads: int = 1,
        recordings: Union[str, List[str], pd.DataFrame] = None,
        skip_existing: bool = False,
        input_profile: str = None,
    ):

        super().__init__(
            project,
            name='standard',
            threads=threads,
            recordings=recordings,
            input_profile=input_profile,
        )

        self.format = "wav"
        self.codec = "pcm_s16le"
        self.sampling = "16000"
        self.skip_existing = bool(skip_existing)

    def process_recording(self, recording):
        if recording["recording_filename"] == "NA":
            return pd.DataFrame()

        original_file = self.project.get_recording_path(
            recording["recording_filename"], self.input_profile
        )

        destination_file = os.path.join(
            self.output_directory(),
            os.path.splitext(recording["recording_filename"])[0]
            + "."
            + self.format,
        )

        os.makedirs(name=os.path.dirname(destination_file), exist_ok=True)

        skip = self.skip_existing and (
            os.path.exists(destination_file) or os.path.islink(destination_file)
        )

        if skip:
            return pd.DataFrame()

        args = [
            "ffmpeg",
            "-y",
            "-loglevel",
            "error",
            "-i",
            original_file,
            "-c:a",
            self.codec,
            "-ar",
            str(self.sampling),
            "-map_channel",
            "0.0.0",
        ]

        args.append(destination_file)

        proc = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
        (stdout, stderr) = proc.communicate()
        success = proc.returncode == 0

        if not success:
            print(stderr, file=sys.stderr)

            return pd.DataFrame(
                [
                    {
                        "original_filename": recording["recording_filename"],
                        "converted_filename": "",
                        "success": False,
                        "error": stderr,
                    }
                ]
            )
        else:
            converted_files = [
                os.path.splitext(recording["recording_filename"])[0]
                + "."
                + self.format
            ]

            return pd.DataFrame(
                [
                    {
                        "original_filename": recording["recording_filename"],
                        "converted_filename": cf,
                        "success": True,
                    }
                    for cf in converted_files
                ]
            )

    @staticmethod
    def add_parser(subparsers, subcommand):
        parser = subparsers.add_parser(subcommand, help="standard audio conversion")

        parser.add_argument(
            "--skip-existing",
            dest="skip_existing",
            required=False,
            default=False,
            action="store_true",
        )
        parser.add_argument(
            "--recordings",
            help="list of recordings to process, separated by whitespaces; only values of 'recording_filename' present in the metadata are supported.",
            default=None,
            nargs="+",
        )

class AudioProcessingPipeline(Pipeline):
    def __init__(self):
@@ -436,7 +560,6 @@ def __init__(self):
    def run(
        self,
        path: str,
        name: str,
        processor: str,
        threads: int = 1,
        func=None,
@@ -457,7 +580,7 @@ def run(
        if processor not in pipelines:
            raise NotImplementedError(f"invalid pipeline '{processor}'")

        proc = pipelines[processor](self.project, name, threads=threads, **kwargs)
        proc = pipelines[processor](self.project, threads=threads, **kwargs)
        proc.process(f"parameters_{date}.yml")

        print("exported audio to {}".format(proc.output_directory()))
@@ -480,7 +603,7 @@ def run(
    @staticmethod
    def setup_parser(parser):
        parser.add_argument("path", help="path to the dataset")
        parser.add_argument("name", help="name of the export profile")
        #parser.add_argument("name", help="name of the export profile")

        subparsers = parser.add_subparsers(help="processor", dest="processor")
        for pipeline in pipelines:
@@ -499,4 +622,3 @@ def setup_parser(parser):
            help="profile of input recordings (process raw recordings by default)",
            default=None,
        )

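For reference, the new AudioStandard processor converts every recording with fixed settings: wav container, pcm_s16le codec, 16 kHz sampling rate, and only the first channel of the first audio stream. Below is a minimal standalone sketch of the ffmpeg call it assembles; the convert_to_standard helper and the example paths are illustrative only and not part of the committed code.

import subprocess

def convert_to_standard(original_file: str, destination_file: str) -> None:
    # Same fixed settings as AudioStandard above.
    args = [
        "ffmpeg",
        "-y",                     # overwrite the destination if it already exists
        "-loglevel", "error",     # only report errors
        "-i", original_file,
        "-c:a", "pcm_s16le",      # signed 16-bit PCM
        "-ar", "16000",           # resample to 16 kHz
        "-map_channel", "0.0.0",  # keep only the first channel of the first stream
        destination_file,
    ]
    proc = subprocess.run(args, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.decode(errors="replace"))

# Example (placeholder paths):
# convert_to_standard("recordings/raw/child1.mp3", "recordings/converted/standard/child1.wav")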
31 changes: 31 additions & 0 deletions tests/test_convert.py
@@ -51,6 +51,37 @@ def test_basic(project):
        ]
    ), "recording files are missing"

def test_standard(project):
    # Starting the audio processing pipeline using the default settings
    processed, parameters = AudioProcessingPipeline().run(
        processor="standard",
        path=project.path,
    )

    recordings = project.recordings
    converted_recordings = pd.read_csv(processed)

    assert np.isclose(
        8000, project.compute_recordings_duration()["duration"].sum()
    ), "audio duration equals expected value"
    assert os.path.exists(
        os.path.join(project.path, CONVERTED_RECORDINGS, "test")
    ), "missing processed recordings folder"
    assert (
        recordings.shape[0] == converted_recordings.shape[0]
    ), "conversion table is incomplete"
    assert all(
        converted_recordings["success"].tolist()
    ), "not all recordings were successfully processed"
    assert all(
        [
            os.path.exists(
                os.path.join(project.path, CONVERTED_RECORDINGS, "test", f)
            )
            for f in converted_recordings["converted_filename"].tolist()
        ]
    ), "recording files are missing"


def test_vetting(project):
    pd.DataFrame(

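A note on the interface change visible above: AudioProcessingPipeline.run() no longer takes a top-level name argument; each processor's sub-parser now declares it, and the standard processor fixes it to 'standard'. A minimal sketch of programmatic use under these assumptions follows; the dataset path and profile name are placeholders, and the 'basic' call assumes its usual format, codec and sampling options.

from ChildProject.pipelines.processors import AudioProcessingPipeline

pipeline = AudioProcessingPipeline()

# 'standard' needs no profile name: output always goes to the 'standard' profile
# as 16 kHz, pcm_s16le, single-channel wav files.
processed, parameters = pipeline.run(
    path="/path/to/dataset",
    processor="standard",
    skip_existing=True,
)

# Other processors now receive the profile name through their own options.
processed, parameters = pipeline.run(
    path="/path/to/dataset",
    processor="basic",
    name="my_profile",
    format="wav",
    codec="pcm_s16le",
    sampling=16000,
)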