-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from benjamin-cates/main
Make the code less ugly!
- Loading branch information
Showing
19 changed files
with
1,017 additions
and
1,007 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
636 changes: 318 additions & 318 deletions
636
data-augmentation/WTS_chunking.py → chunking_methods/WTS_chunking.py
Large diffs are not rendered by default.
Oops, something went wrong.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
import os | ||
import shutil | ||
import sys | ||
from random import shuffle | ||
from math import floor | ||
from file_utils import clear_files | ||
|
||
|
||
# Default fraction of a species' recordings that goes to the validation split.
VALIDATION_DISTR = 0.2


def distribute_files(path, validation_distr=VALIDATION_DISTR):
    """Split each species' chunked recordings into training and validation.

    For every species folder in ``<path>/BirdCLEF2023_train_audio`` the
    ``.wav`` files that have at least one matching chunk file in
    ``<path>/BirdCLEF2023_train_audio_chunks/<species>`` are copied to
    ``<path>/BirdCLEF2023_split_audio/training/<species>`` or
    ``<path>/BirdCLEF2023_split_audio/validation/<species>``.

    A recording "has chunks" when its file name without extension occurs in
    the name of a file in the species' chunk folder.

    Args:
        path: Dataset root containing the folders named above.
        validation_distr: Fraction of a species' usable recordings to place
            in the validation split (at least one, and never all of them
            when the species has two or more recordings).

    Species with a single usable recording get that recording copied into
    both splits. Species without usable recordings produce no output folder.
    """
    file_path = os.path.join(path, 'BirdCLEF2023_train_audio')
    wav_path = os.path.join(path, 'BirdCLEF2023_split_audio')
    chunk_path_old = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
    train_path = os.path.join(wav_path, 'training')
    validation_path = os.path.join(wav_path, 'validation')

    species_dirs = [entry for entry in os.scandir(file_path) if entry.is_dir()]
    for species_dir in species_dirs:
        species = species_dir.name
        species_chunk_dir = os.path.join(chunk_path_old, species)
        if not os.path.isdir(species_chunk_dir):
            # No chunks were generated for this species at all.
            continue

        # List the chunk folder once per species instead of once per wav file.
        chunk_names = os.listdir(species_chunk_dir)

        files = []
        for entry in os.scandir(species_dir.path):
            if not entry.name.endswith('.wav'):
                continue
            stem = os.path.splitext(entry.name)[0]
            # Match against chunk file names only, so that a stem which
            # happens to occur in a directory name is not a false positive.
            if any(stem in chunk_name for chunk_name in chunk_names):
                files.append(entry.name)

        if not files:
            continue

        s_train_path = os.path.join(train_path, species)
        s_validation_path = os.path.join(validation_path, species)
        os.makedirs(s_train_path, exist_ok=True)
        os.makedirs(s_validation_path, exist_ok=True)

        if len(files) == 1:
            # With a single recording, keep the same file in both splits.
            train_files = validation_files = files
        else:
            # Size the validation split from the recordings actually being
            # distributed; sizing it from the raw directory listing could
            # exceed len(files) and leave the training split empty.
            num_validation = max(1, floor(len(files) * validation_distr))
            num_validation = min(num_validation, len(files) - 1)
            shuffle(files)
            train_files = files[:-num_validation]
            validation_files = files[-num_validation:]

        for name in train_files:
            shutil.copyfile(os.path.join(species_dir.path, name),
                            os.path.join(s_train_path, name))

        for name in validation_files:
            shutil.copyfile(os.path.join(species_dir.path, name),
                            os.path.join(s_validation_path, name))
|
||
|
||
def distribute_chunks(path):
    """Copy chunk files into the split that holds their source recording.

    Walks ``<path>/BirdCLEF2023_split_audio/<split>/<species>`` and, for each
    file found there, copies every file from
    ``<path>/BirdCLEF2023_train_audio_chunks/<species>`` whose name contains
    the recording's name (without extension) into
    ``<path>/BirdCLEF2023_split_chunks/<split>/<species>``.

    Args:
        path: Dataset root containing the folders named above.

    The destination species folder is created even when the species has no
    chunk folder; in that case it is left empty.
    """
    wav_path = os.path.join(path, 'BirdCLEF2023_split_audio')
    chunk_path_old = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
    chunk_path_new = os.path.join(path, 'BirdCLEF2023_split_chunks')

    # One folder per split, e.g. .../BirdCLEF2023_split_audio/validation
    split_dirs = [entry for entry in os.scandir(wav_path) if entry.is_dir()]
    for split_dir in split_dirs:
        # One folder per species, e.g. .../validation/rerswa1
        species_dirs = [entry for entry in os.scandir(split_dir.path) if entry.is_dir()]
        for species_dir in species_dirs:
            species = species_dir.name

            chunk_path_dst = os.path.join(chunk_path_new, split_dir.name, species)
            os.makedirs(chunk_path_dst, exist_ok=True)

            chunk_path_src = os.path.join(chunk_path_old, species)
            if not os.path.exists(chunk_path_src):
                continue

            # List the chunk folder once per species instead of once per
            # recording; only regular files can be copied.
            chunk_names = [entry.name for entry in os.scandir(chunk_path_src)
                           if entry.is_file()]

            recordings = [entry.name for entry in os.scandir(species_dir.path)
                          if entry.is_file()]
            for recording in recordings:
                stem = os.path.splitext(recording)[0]
                for chunk_name in chunk_names:
                    # Compare against the chunk file name only, so that a
                    # stem occurring in a directory name cannot match.
                    if stem in chunk_name:
                        shutil.copyfile(os.path.join(chunk_path_src, chunk_name),
                                        os.path.join(chunk_path_dst, chunk_name))
|
||
|
||
if __name__ == '__main__':
    # Exactly one argument is accepted: the dataset root directory.
    if len(sys.argv) == 2:
        dataset_root = sys.argv[1]
        distribute_files(dataset_root)
        distribute_chunks(dataset_root)
        # The split audio files are removed once the chunks have been distributed.
        clear_files(os.path.join(dataset_root, 'BirdCLEF2023_split_audio'))
    else:
        print("Incorrect number of args", file=sys.stderr)
        print("USAGE: python distribute_chunks.py /path", file=sys.stderr)
        sys.exit(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
import os | ||
|
||
def clear_files(path):
    """Delete every regular file located exactly two folder levels below *path*.

    Only files of the form ``<path>/<dir>/<dir>/<file>`` are removed. Files
    directly in *path*, files one level down, and anything deeper are left
    alone, as are all directories.
    """
    first_level = [entry for entry in os.scandir(path) if entry.is_dir()]
    for outer in first_level:
        second_level = [entry for entry in os.scandir(outer.path) if entry.is_dir()]
        for inner in second_level:
            doomed = [entry.path for entry in os.scandir(inner.path) if entry.is_file()]
            for target in doomed:
                os.remove(target)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import os | ||
import sys | ||
import pydub | ||
import pandas as pd | ||
from WTS_chunking import * | ||
|
||
|
||
def generate_chunked_df(path, save_chunks=True):
    """Chunk the strong-label annotations and optionally save them as CSV.

    Reads ``<path>/BirdCLEF2023_Strong_Labels.csv``, passes the frame to
    ``dynamic_yan_chunking`` (``chunk_count=5``, ``chunk_duration=5``,
    ``only_slide=False``) and returns the result.

    Args:
        path: Folder containing the strong-label CSV.
        save_chunks: When true, the chunked frame is also written to
            ``<path>/BirdCLEF2023_TweetyNet_Chunks.csv``.

    Returns:
        The frame returned by ``dynamic_yan_chunking``.
    """
    labels_csv = os.path.join(path, 'BirdCLEF2023_Strong_Labels.csv')
    chunked_df = dynamic_yan_chunking(
        pd.read_csv(labels_csv),
        chunk_count=5,
        chunk_duration=5,
        only_slide=False,
    )
    if not save_chunks:
        return chunked_df
    chunked_df.to_csv(os.path.join(path, 'BirdCLEF2023_TweetyNet_Chunks.csv'))
    return chunked_df
|
||
def generate_wavs_from_chunks(path, chunk_duration):
    """Cut the chunks listed in the chunk CSV out of their source recordings.

    Reads ``<path>/BirdCLEF2023_TweetyNet_Chunks.csv`` row by row. Each row's
    ``IN FILE`` is loaded from ``<path>/train_audio/<STRONG LABEL>/`` and the
    span starting at ``OFFSET`` seconds and lasting ``chunk_duration`` seconds
    is exported to
    ``<path>/BirdCLEF2023_train_audio_chunks/<STRONG LABEL>/<stem>_<n>.wav``,
    where ``<n>`` counts the rows seen for the current file, starting at 1.

    Args:
        path: Dataset root containing the CSV and the ``train_audio`` folder.
        chunk_duration: Chunk length in seconds.

    A chunk whose actual length differs from ``chunk_duration`` is not
    exported; a message is printed instead. Its number is still consumed, so
    the numbering of later chunks from the same file is unaffected.
    """
    chunk_path = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
    os.makedirs(chunk_path, exist_ok=True)
    chunked_df = pd.read_csv(os.path.join(path, 'BirdCLEF2023_TweetyNet_Chunks.csv'))

    file_name = ''
    label = ''
    folder_path = ''
    wav_file = None
    chunk_count = 0
    chunk_duration *= 1000  # pydub measures audio in milliseconds

    for _, row in chunked_df.iterrows():
        if row['STRONG LABEL'] != label:
            # New label: switch to (and create if needed) its output folder.
            label = row['STRONG LABEL']
            folder_path = os.path.join(chunk_path, label)
            os.makedirs(folder_path, exist_ok=True)

        if row['IN FILE'] != file_name:
            # New source recording: load it once and restart the numbering.
            file_name = row['IN FILE']
            wave_file_path = os.path.join(path, 'train_audio', label, file_name)
            wav_file = pydub.AudioSegment.from_wav(wave_file_path)
            chunk_count = 1

        # pydub splices in milliseconds, so multiply by 1000
        offset = float(row['OFFSET']) * 1000
        chunk = wav_file[offset : offset + chunk_duration]

        # Explicit check rather than `assert`: asserts are stripped under
        # `python -O`, which would let wrong-length chunks be exported.
        if len(chunk) == chunk_duration:
            chunk_name = os.path.splitext(file_name)[0] + '_' + str(chunk_count) + '.wav'
            chunk.export(os.path.join(folder_path, chunk_name), format='wav')
        else:
            print(
                f"Chunk of length {chunk_duration / 1000}s could not be generated from {file_name}. \n "
                f"Got chunk of length {len(chunk) / 1000}s. Check chunking script."
            )
        chunk_count += 1
|
||
def delete_chunks_with_len(path, length):
    """Remove chunk files whose duration equals *length* seconds.

    Looks at every file in each sub-folder of
    ``<path>/BirdCLEF2023_train_audio_chunks``, loads it as WAV with pydub
    and deletes it when its length in milliseconds equals ``length * 1000``.

    Args:
        path: Dataset root containing the chunk folder.
        length: Duration, in seconds, of the chunks to delete.
    """
    chunk_path = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
    target_ms = length * 1000  # pydub reports lengths in milliseconds
    species_dirs = [entry.path for entry in os.scandir(chunk_path) if entry.is_dir()]
    for species_dir in species_dirs:
        for name in os.listdir(species_dir):
            candidate = os.path.join(species_dir, name)
            if not os.path.isfile(candidate):
                continue
            segment = pydub.AudioSegment.from_wav(candidate)
            if len(segment) == target_ms:
                os.remove(candidate)
|
||
|
||
if __name__ == '__main__':
    # Exactly one argument is accepted: the dataset root directory.
    if len(sys.argv) == 2:
        dataset_root = sys.argv[1]
        generate_chunked_df(dataset_root)
        # Chunks are cut at a fixed length of 5 seconds.
        generate_wavs_from_chunks(dataset_root, 5)
    else:
        print("Incorrect number of args", file=sys.stderr)
        print("USAGE: python gen_chunks_from_labels.py /path", file=sys.stderr)
        sys.exit(1)
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
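# Resample classes: classes with fewer than `up_sample_to` files are upsampled to `up_sample_to`, and classes with more than `down_sample_from` files are limited to `down_sample_from` (see the constants below).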
import os | ||
import shutil | ||
from random import shuffle | ||
from random import choice | ||
from math import floor | ||
from file_utils import clear_files | ||
|
||
# Source folder with one sub-folder per class, and the folder the resampled
# copy is written to.
chunk_path_old = '/share/acoustic_species_id/pretraining_combined'
chunk_path_new = '/share/acoustic_species_id/pretraining_combined_resampled'
# Classes with more files than this are cut down to this many.
down_sample_from = 50
# Classes with fewer files than this are padded up to this many.
up_sample_to = 0
# Only files with this suffix are considered.
filetype = ".mp3"


def distribute_files(src_path=None, dst_path=None, max_files=None,
                     min_files=None, extension=None):
    """Copy every class folder to a new location, resampling its file count.

    For each sub-folder of *src_path* the files ending in *extension* are
    copied to a sub-folder of the same name under *dst_path*:

    * more than *max_files* files: a random selection of exactly
      *max_files* files is copied;
    * fewer than *min_files* files: all files are copied, plus randomly
      chosen duplicates (saved as ``<stem>_dup<i><ext>``) until the class
      holds *min_files* files;
    * otherwise: all files are copied unchanged.

    Classes without any matching file are skipped and get no output folder.
    The number of down-sampled and up-sampled classes is printed at the end,
    in that order.

    Args:
        src_path: Folder with one sub-folder per class. Defaults to the
            module-level ``chunk_path_old``.
        dst_path: Output folder. Defaults to ``chunk_path_new``.
        max_files: Upper limit per class. Defaults to ``down_sample_from``.
        min_files: Lower limit per class. Defaults to ``up_sample_to``.
        extension: File-name suffix to select. Defaults to ``filetype``.
    """
    src_path = chunk_path_old if src_path is None else src_path
    dst_path = chunk_path_new if dst_path is None else dst_path
    max_files = down_sample_from if max_files is None else max_files
    min_files = up_sample_to if min_files is None else min_files
    extension = filetype if extension is None else extension

    upsampled_classes = 0
    downsampled_classes = 0

    class_dirs = [entry for entry in os.scandir(src_path) if entry.is_dir()]
    for class_dir in class_dirs:
        files = [entry.name for entry in os.scandir(class_dir.path)
                 if entry.name.endswith(extension)]
        if not files:
            continue

        s_path = os.path.join(dst_path, class_dir.name)
        os.makedirs(s_path, exist_ok=True)

        # (source name, destination name) pairs to copy for this class.
        if len(files) > max_files:
            shuffle(files)
            # Slice from index 0 so that exactly max_files files are kept.
            copy_pairs = [(name, name) for name in files[:max_files]]
            downsampled_classes += 1
        elif len(files) >= min_files:
            # copy everything over
            copy_pairs = [(name, name) for name in files]
        else:
            copy_pairs = [(name, name) for name in files]
            for i in range(min_files - len(files)):
                source = choice(files)
                stem, ext = os.path.splitext(source)
                # Duplicates need their own name, otherwise they overwrite
                # the original and the class does not grow.
                copy_pairs.append((source, stem + '_dup' + str(i) + ext))
            upsampled_classes += 1

        for source, target in copy_pairs:
            shutil.copyfile(os.path.join(class_dir.path, source),
                            os.path.join(s_path, target))

    print(downsampled_classes)
    print(upsampled_classes)
|
||
|
||
|
||
if __name__ == '__main__':
    # On a first run the output folder does not exist yet and there is
    # nothing to clear; scanning it would raise FileNotFoundError.
    if os.path.isdir(chunk_path_new):
        # distribute_files writes to chunk_path_new/<species>/<file>, while
        # clear_files only removes files two folder levels down. Remove the
        # species-level files here so output of a previous run cannot pile
        # up on top of the new selection.
        for species_dir in [d.path for d in os.scandir(chunk_path_new) if d.is_dir()]:
            for stale in [f.path for f in os.scandir(species_dir) if f.is_file()]:
                os.remove(stale)
        clear_files(chunk_path_new)
    distribute_files()
Empty file.
Oops, something went wrong.