Skip to content

Commit

Permalink
Merge pull request #2 from benjamin-cates/main
Browse files Browse the repository at this point in the history
Make the code less ugly!
  • Loading branch information
Sean1572 authored Jun 30, 2023
2 parents e03f81a + 138bc44 commit fbc6c41
Show file tree
Hide file tree
Showing 19 changed files with 1,017 additions and 1,007 deletions.
13 changes: 8 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ classification/wandb/*
*.pt
*.out
wandb/*
data-augmentation/move_missing_files.py
data-augmentation/find_failing_species.py
data-augmentation/birdclef2023_tweety_labels_log.txt
data-augmentation/__pycache__/*
data-augmentation/gen_data_stats.py
chunking_methods/move_missing_files.py
chunking_methods/find_failing_species.py
chunking_methods/birdclef2023_tweety_labels_log.txt
chunking_methods/__pycache__/*
chunking_methods/gen_data_stats.py
*.mp3
cosmos_labeled_data_files_added.csv
models/*
data/*
__pycache__/
636 changes: 318 additions & 318 deletions data-augmentation/WTS_chunking.py → chunking_methods/WTS_chunking.py

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
111 changes: 111 additions & 0 deletions chunking_methods/distribute_chunks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import os
import shutil
import sys
from random import shuffle
from math import floor
from file_utils import clear_files


# Fraction of each species' eligible recordings that goes to validation.
VALIDATION_DISTR = 0.2


def distribute_files(path):
    """Split each species' chunked recordings into training and validation.

    For every species folder in ``<path>/BirdCLEF2023_train_audio`` the
    ``.wav`` recordings that have at least one chunk in
    ``<path>/BirdCLEF2023_train_audio_chunks/<species>`` are copied into
    ``<path>/BirdCLEF2023_split_audio/training/<species>`` and
    ``<path>/BirdCLEF2023_split_audio/validation/<species>``.

    A chunk belongs to a recording when its file name is
    ``<recording stem>_<suffix>.<ext>``; the text before the last underscore
    must equal the recording's stem exactly, so ``XC1.wav`` is not matched by
    ``XC12_1.wav``.

    Species without any eligible recording are skipped and get no output
    folders. A species with exactly one eligible recording has that recording
    copied to both splits. Otherwise ``VALIDATION_DISTR`` of the *eligible*
    recordings (at least one) go to validation and the rest to training, so
    neither split is ever empty.

    Args:
        path: Dataset root that contains the folders named above.
    """
    file_path = os.path.join(path, 'BirdCLEF2023_train_audio')
    wav_path = os.path.join(path, 'BirdCLEF2023_split_audio')
    chunk_path_old = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')

    train_path = os.path.join(wav_path, 'training')
    validation_path = os.path.join(wav_path, 'validation')

    def chunked_stems(species):
        """Return the stems of the recordings that have chunks for ``species``."""
        species_chunks = os.path.join(chunk_path_old, species)
        if not os.path.isdir(species_chunks):
            return set()
        # One scan per species instead of one scan per recording.
        return {
            os.path.splitext(entry.name)[0].rpartition('_')[0]
            for entry in os.scandir(species_chunks)
        }

    species_dirs = [entry for entry in os.scandir(file_path) if entry.is_dir()]

    for species_dir in species_dirs:
        species = species_dir.name
        stems_with_chunks = chunked_stems(species)

        files = [
            entry.name
            for entry in os.scandir(species_dir.path)
            if entry.is_file()
            and entry.name.endswith('.wav')
            and os.path.splitext(entry.name)[0] in stems_with_chunks
        ]
        if not files:
            continue

        s_train_path = os.path.join(train_path, species)
        s_validation_path = os.path.join(validation_path, species)
        os.makedirs(s_train_path, exist_ok=True)
        os.makedirs(s_validation_path, exist_ok=True)

        if len(files) == 1:
            # A single recording is kept in both training and validation.
            train_files = files
            validation_files = files
        else:
            # Size the split from the recordings actually being distributed;
            # counting every directory entry could leave training empty.
            num_validation = max(1, floor(len(files) * VALIDATION_DISTR))
            shuffle(files)
            train_files = files[:-num_validation]
            validation_files = files[-num_validation:]

        for name in train_files:
            shutil.copyfile(os.path.join(species_dir.path, name),
                            os.path.join(s_train_path, name))

        for name in validation_files:
            shutil.copyfile(os.path.join(species_dir.path, name),
                            os.path.join(s_validation_path, name))


def distribute_chunks(path):
    """Copy each recording's chunks into the split its recording landed in.

    Walks ``<path>/BirdCLEF2023_split_audio/<split>/<species>`` and, for every
    recording found there, copies its chunks from
    ``<path>/BirdCLEF2023_train_audio_chunks/<species>`` into
    ``<path>/BirdCLEF2023_split_chunks/<split>/<species>``.

    A chunk belongs to a recording when its file name is
    ``<recording stem>_<suffix>.<ext>``. The stem must match exactly; a plain
    substring test would also hand ``XC12_1.wav`` to recording ``XC1`` and
    could leak one split's chunks into the other.

    The destination species folder is created even when the species has no
    chunk folder, in which case it is left empty.

    Args:
        path: Dataset root that contains the folders named above.
    """
    wav_path = os.path.join(path, 'BirdCLEF2023_split_audio')
    chunk_path_old = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
    chunk_path_new = os.path.join(path, 'BirdCLEF2023_split_chunks')

    # e.g. .../BirdCLEF2023_split_audio/validation
    split_dirs = [entry for entry in os.scandir(wav_path) if entry.is_dir()]

    for split_dir in split_dirs:
        # e.g. .../BirdCLEF2023_split_audio/validation/rerswa1
        species_dirs = [entry for entry in os.scandir(split_dir.path) if entry.is_dir()]

        for species_dir in species_dirs:
            species = species_dir.name

            chunk_path_dst = os.path.join(chunk_path_new, split_dir.name, species)
            os.makedirs(chunk_path_dst, exist_ok=True)

            chunk_path_src = os.path.join(chunk_path_old, species)
            if not os.path.isdir(chunk_path_src):
                continue

            # Stems of the recordings assigned to this split and species.
            wav_stems = {
                os.path.splitext(entry.name)[0]
                for entry in os.scandir(species_dir.path)
                if entry.is_file()
            }

            # Single pass over the chunk folder instead of one scan per wav.
            chunk_entries = [entry for entry in os.scandir(chunk_path_src) if entry.is_file()]
            for chunk in chunk_entries:
                owner_stem = os.path.splitext(chunk.name)[0].rpartition('_')[0]
                if owner_stem in wav_stems:
                    shutil.copyfile(chunk.path, os.path.join(chunk_path_dst, chunk.name))


if __name__ == '__main__':
    # Exactly one positional argument is expected: the dataset root.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Incorrect number of args", file=sys.stderr)
        print("USAGE: python distribute_chunks.py /path", file=sys.stderr)
        sys.exit(1)

    data_root = cli_args[0]
    distribute_files(data_root)
    distribute_chunks(data_root)
    # clear_files is imported from file_utils; presumably this drops the
    # intermediate wav copies once their chunks are distributed (confirm).
    clear_files(os.path.join(data_root, 'BirdCLEF2023_split_audio'))
10 changes: 10 additions & 0 deletions chunking_methods/file_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import os

def clear_files(path):
    """Delete every regular file located exactly two folders below ``path``.

    Only files in ``path/<dir>/<dir>/`` are removed. Files directly inside
    ``path`` or inside its first-level folders are left alone, as are all
    directories and anything nested deeper.

    Args:
        path: Root folder whose second-level subfolders are emptied.
    """
    def child_dirs(parent):
        # Snapshot the subfolder paths before anything is deleted.
        return [entry.path for entry in os.scandir(parent) if entry.is_dir()]

    for outer_dir in child_dirs(path):
        for inner_dir in child_dirs(outer_dir):
            doomed = [entry.path for entry in os.scandir(inner_dir) if entry.is_file()]
            for target in doomed:
                os.remove(target)
73 changes: 73 additions & 0 deletions chunking_methods/gen_chunks_from_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import os
import sys
import pydub
import pandas as pd
from WTS_chunking import *


def generate_chunked_df(path, save_chunks=True):
    """Build the chunk table from the strong-label CSV under ``path``.

    Reads ``<path>/BirdCLEF2023_Strong_Labels.csv`` and passes it to
    ``dynamic_yan_chunking`` (presumably provided by the ``WTS_chunking``
    star import; confirm) with ``chunk_count=5``, ``chunk_duration=5`` and
    ``only_slide=False``.

    Args:
        path: Folder holding the strong-label CSV.
        save_chunks: When true, also write the result to
            ``<path>/BirdCLEF2023_TweetyNet_Chunks.csv``.

    Returns:
        Whatever ``dynamic_yan_chunking`` returns for the label table.
    """
    labels_csv = os.path.join(path, 'BirdCLEF2023_Strong_Labels.csv')
    strong_labels = pd.read_csv(labels_csv)
    chunked_df = dynamic_yan_chunking(
        strong_labels,
        chunk_count=5,
        chunk_duration=5,
        only_slide=False,
    )
    if not save_chunks:
        return chunked_df

    chunked_df.to_csv(os.path.join(path, 'BirdCLEF2023_TweetyNet_Chunks.csv'))
    return chunked_df

def generate_wavs_from_chunks(path, chunk_duration):
    """Cut the chunks listed in the chunk CSV out of their source wav files.

    Reads ``<path>/BirdCLEF2023_TweetyNet_Chunks.csv`` row by row. Each row
    yields one chunk of ``chunk_duration`` seconds, starting at the row's
    ``OFFSET`` (seconds), taken from
    ``<path>/train_audio/<STRONG LABEL>/<IN FILE>``. The chunk is written to
    ``<path>/BirdCLEF2023_train_audio_chunks/<STRONG LABEL>/<stem>_<n>.wav``,
    where ``n`` counts the rows seen for the current ``IN FILE`` from 1.

    The source wav is reloaded only when ``IN FILE`` changes between
    consecutive rows, so rows are expected to be grouped by file.

    A chunk that comes out shorter than requested is reported on stdout and
    not written; its index is still consumed. The length check is an explicit
    ``if`` rather than an ``assert`` so it still runs under ``python -O``.

    Args:
        path: Dataset root holding the CSV and the ``train_audio`` folder.
        chunk_duration: Chunk length in seconds.
    """
    chunk_path = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
    os.makedirs(chunk_path, exist_ok=True)
    chunked_df = pd.read_csv(os.path.join(path, 'BirdCLEF2023_TweetyNet_Chunks.csv'))

    # pydub slices in milliseconds.
    duration_ms = chunk_duration * 1000

    file_name = ''
    label = ''
    folder_path = ''
    wav_file = None
    chunk_count = 0

    for _, row in chunked_df.iterrows():
        if row['STRONG LABEL'] != label:
            label = row['STRONG LABEL']
            folder_path = os.path.join(chunk_path, label)
            os.makedirs(folder_path, exist_ok=True)

        if row['IN FILE'] != file_name:
            file_name = row['IN FILE']
            wave_file_path = os.path.join(path, 'train_audio', label, file_name)
            wav_file = pydub.AudioSegment.from_wav(wave_file_path)
            chunk_count = 1

        # splice wav file and save chunk
        offset = float(row['OFFSET']) * 1000
        chunk = wav_file[offset:offset + duration_ms]

        if len(chunk) == duration_ms:
            chunk.export(
                os.path.join(folder_path, file_name[:-4] + '_' + str(chunk_count) + '.wav'),
                format='wav',
            )
        else:
            print(
                f"Chunk of length {duration_ms / 1000}s could not be generated from {file_name}. \n "
                f"Got chunk of length {len(chunk) / 1000}s. Check chunking script."
            )
        chunk_count += 1

def delete_chunks_with_len(path, length):
    """Remove every chunk whose duration is exactly ``length`` seconds.

    Looks at the files directly inside each species folder of
    ``<path>/BirdCLEF2023_train_audio_chunks``, loads each one as a wav and
    deletes it when its pydub length equals ``length * 1000`` milliseconds.

    Args:
        path: Dataset root holding the chunk folder.
        length: Duration, in seconds, of the chunks to delete.
    """
    chunk_root = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
    target_ms = length * 1000

    species_dirs = [entry.path for entry in os.scandir(chunk_root) if entry.is_dir()]
    for species_dir in species_dirs:
        candidates = []
        for name in os.listdir(species_dir):
            candidate = os.path.join(species_dir, name)
            if os.path.isfile(candidate):
                candidates.append(candidate)

        for candidate in candidates:
            audio = pydub.AudioSegment.from_wav(candidate)
            if len(audio) == target_ms:
                os.remove(candidate)


if __name__ == '__main__':
    # Exactly one positional argument is expected: the dataset root.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Incorrect number of args", file=sys.stderr)
        print("USAGE: python gen_chunks_from_labels.py /path", file=sys.stderr)
        sys.exit(1)

    data_root = cli_args[0]
    # First write the chunk table, then cut 5-second wav chunks from it.
    generate_chunked_df(data_root)
    generate_wavs_from_chunks(data_root, 5)
File renamed without changes.
File renamed without changes.
61 changes: 61 additions & 0 deletions chunking_methods/resample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
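# Resample a per-class dataset: classes with more than `down_sample_from` files are capped at that many, and classes with fewer than `up_sample_to` files are upsampled to that many (this header originally read "upsample to 50, limit to 500"; the constants below hold the values actually used)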
import os
import shutil
from random import shuffle
from random import choice
from math import floor
from file_utils import clear_files

# Default source and destination folders; each holds one subfolder per class.
chunk_path_old = '/share/acoustic_species_id/pretraining_combined'
chunk_path_new = '/share/acoustic_species_id/pretraining_combined_resampled'
# Classes with more files than this are randomly cut down to this many.
down_sample_from = 50
# Classes with fewer files than this are padded with duplicates (0 disables).
up_sample_to = 0
# Only files whose name ends with this suffix are considered.
filetype = ".mp3"


def distribute_files(src_root=None, dst_root=None, max_per_class=None,
                     min_per_class=None, extension=None):
    """Copy a per-class dataset while capping and padding the class sizes.

    Every subfolder of ``src_root`` is treated as one class. Its files ending
    in ``extension`` are copied to ``dst_root/<class>``:

    * more than ``max_per_class`` files: a random ``max_per_class`` of them
      are copied;
    * fewer than ``min_per_class`` files: all of them are copied, plus
      randomly chosen duplicates named ``<stem>_dup<i><ext>`` until the class
      holds ``min_per_class`` files (distinct names are required, otherwise
      the duplicates would overwrite each other);
    * otherwise: all of them are copied unchanged.

    Classes without any matching file are skipped and get no output folder.
    The number of downsampled and then upsampled classes is printed at the end.

    Args:
        src_root: Source folder; defaults to ``chunk_path_old``.
        dst_root: Destination folder; defaults to ``chunk_path_new``.
        max_per_class: Upper bound per class; defaults to ``down_sample_from``.
        min_per_class: Lower bound per class; defaults to ``up_sample_to``.
        extension: File-name suffix to select; defaults to ``filetype``.
    """
    src_root = chunk_path_old if src_root is None else src_root
    dst_root = chunk_path_new if dst_root is None else dst_root
    max_per_class = down_sample_from if max_per_class is None else max_per_class
    min_per_class = up_sample_to if min_per_class is None else min_per_class
    extension = filetype if extension is None else extension

    upsampled_classes = 0
    downsampled_classes = 0

    class_dirs = [entry for entry in os.scandir(src_root) if entry.is_dir()]

    for class_dir in class_dirs:
        files = [
            entry.name
            for entry in os.scandir(class_dir.path)
            if entry.name.endswith(extension)
        ]
        if not files:
            continue

        s_path = os.path.join(dst_root, class_dir.name)
        os.makedirs(s_path, exist_ok=True)

        # (source name, destination name) pairs to copy for this class.
        if len(files) > max_per_class:
            shuffle(files)
            # Slice from the start; starting at index 1 dropped one file.
            copy_plan = [(name, name) for name in files[:max_per_class]]
            downsampled_classes += 1
        else:
            copy_plan = [(name, name) for name in files]
            if len(files) < min_per_class:
                for i in range(min_per_class - len(files)):
                    source = choice(files)
                    stem, ext = os.path.splitext(source)
                    copy_plan.append((source, f"{stem}_dup{i}{ext}"))
                upsampled_classes += 1

        for source, target in copy_plan:
            shutil.copyfile(os.path.join(class_dir.path, source),
                            os.path.join(s_path, target))

    print(downsampled_classes)
    print(upsampled_classes)



if __name__ == '__main__':
    # On a first run the output folder does not exist yet, and clear_files
    # would fail while scanning it; there is nothing to clear in that case.
    if os.path.isdir(chunk_path_new):
        # NOTE(review): distribute_files writes to <chunk_path_new>/<class>/,
        # one folder below the root. If clear_files (from file_utils) only
        # removes files two folders below its argument, stale files from an
        # earlier run are not removed here. Confirm the intended layout.
        clear_files(chunk_path_new)
    distribute_files()
Empty file.
Loading

0 comments on commit fbc6c41

Please sign in to comment.