Merge pull request #2 from benjamin-cates/main

Make the code less ugly!
UCSD-E4E · Jun 30, 2023 · fbc6c41 · fbc6c41
2 parents e03f81a + 138bc44
commit fbc6c41
Show file tree

Hide file tree

Showing 19 changed files with 1,017 additions and 1,007 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,10 +11,13 @@ classification/wandb/*
 *.pt
 *.out
 wandb/*
-data-augmentation/move_missing_files.py
-data-augmentation/find_failing_species.py
-data-augmentation/birdclef2023_tweety_labels_log.txt
-data-augmentation/__pycache__/*
-data-augmentation/gen_data_stats.py
+chunking_methods/move_missing_files.py
+chunking_methods/find_failing_species.py
+chunking_methods/birdclef2023_tweety_labels_log.txt
+chunking_methods/__pycache__/*
+chunking_methods/gen_data_stats.py
 *.mp3
 cosmos_labeled_data_files_added.csv
+models/*
+data/*
+__pycache__/
diff --git a/data-augmentation/WTS_chunking.py → chunking_methods/WTS_chunking.py b/data-augmentation/WTS_chunking.py → chunking_methods/WTS_chunking.py
diff --git a/data-augmentation/attach_strong_labels.py → chunking_methods/attach_strong_labels.py b/data-augmentation/attach_strong_labels.py → chunking_methods/attach_strong_labels.py
diff --git a/data-augmentation/combine_folders.py → chunking_methods/combine_folders.py b/data-augmentation/combine_folders.py → chunking_methods/combine_folders.py
diff --git a/chunking_methods/distribute_chunks.py b/chunking_methods/distribute_chunks.py
@@ -0,0 +1,111 @@
+import os
+import shutil
+import sys
+from random import shuffle
+from math import floor
+from file_utils import clear_files
+
+
+# Fraction used by distribute_files() to size each species' validation split.
+VALIDATION_DISTR = 0.2
+
+
+def distribute_files(path):
+    """Copy each species' chunked recordings into training/validation folders.
+
+    For every species folder under ``<path>/BirdCLEF2023_train_audio``, the
+    ``.wav`` recordings that have at least one chunk file in
+    ``<path>/BirdCLEF2023_train_audio_chunks/<species>`` are shuffled and
+    copied to ``<path>/BirdCLEF2023_split_audio/training/<species>`` or
+    ``<path>/BirdCLEF2023_split_audio/validation/<species>``.
+
+    A recording counts as chunked when its name without the extension occurs
+    in the file name of some chunk of the same species.  A species with a
+    single usable recording gets that recording in both splits; otherwise
+    ``VALIDATION_DISTR`` of the usable recordings (at least one, never all of
+    them) go to validation and the rest to training.  Species without usable
+    recordings are skipped and get no output folders.
+
+    Args:
+        path: Dataset root containing the folders named above.
+    """
+    file_path = os.path.join(path, 'BirdCLEF2023_train_audio')
+    wav_path = os.path.join(path, 'BirdCLEF2023_split_audio')
+    chunk_path_old = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
+    train_path = os.path.join(wav_path, 'training')
+    validation_path = os.path.join(wav_path, 'validation')
+
+    subfolders = [f.path for f in os.scandir(file_path) if f.is_dir()]
+
+    for s in subfolders:
+        species = os.path.basename(s)
+        s_train_path = os.path.join(train_path, species)
+        s_validation_path = os.path.join(validation_path, species)
+
+        # List the chunk folder once per species rather than once per recording.
+        chunk_dir = os.path.join(chunk_path_old, species)
+        chunk_names = os.listdir(chunk_dir) if os.path.isdir(chunk_dir) else []
+
+        # Match against chunk file names only, so a directory name that happens
+        # to contain the recording name cannot produce a false match.
+        files = [
+            f.name for f in os.scandir(s)
+            if f.name.endswith('.wav')
+            and any(os.path.splitext(f.name)[0] in chunk for chunk in chunk_names)
+        ]
+
+        if not files:
+            continue
+
+        os.makedirs(s_train_path, exist_ok=True)
+        os.makedirs(s_validation_path, exist_ok=True)
+
+        if len(files) == 1:
+            # With a single recording, keep the same file in both splits.
+            train_files = validation_files = files
+        else:
+            # Size the split from the recordings actually being distributed and
+            # always leave at least one recording for training.
+            num_validation = max(1, floor(len(files) * VALIDATION_DISTR))
+            num_validation = min(num_validation, len(files) - 1)
+
+            shuffle(files)
+
+            train_files = files[:-num_validation]
+            validation_files = files[-num_validation:]
+
+        for file in train_files:
+            shutil.copyfile(os.path.join(s, file), os.path.join(s_train_path, file))
+
+        for file in validation_files:
+            shutil.copyfile(os.path.join(s, file), os.path.join(s_validation_path, file))
+
+
+def distribute_chunks(path):
+    """Copy the chunks of every split recording into a matching split folder.
+
+    Walks ``<path>/BirdCLEF2023_split_audio/<split>/<species>`` and, for each
+    file found there, copies the chunks from
+    ``<path>/BirdCLEF2023_train_audio_chunks/<species>`` whose file name
+    contains the recording's name (without extension) into
+    ``<path>/BirdCLEF2023_split_chunks/<split>/<species>``.
+
+    The destination folder is created even when the species has no chunk
+    folder; in that case nothing is copied into it.
+
+    Args:
+        path: Dataset root containing the folders named above.
+    """
+    wav_path = os.path.join(path, 'BirdCLEF2023_split_audio')
+    chunk_path_old = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
+    chunk_path_new = os.path.join(path, 'BirdCLEF2023_split_chunks')
+    split_folders = [f.path for f in os.scandir(wav_path) if f.is_dir()]
+
+    # One pass per split folder (e.g. .../BirdCLEF2023_split_audio/validation).
+    for split_path in split_folders:
+        split_name = os.path.basename(split_path)
+        species_folders = [f.path for f in os.scandir(split_path) if f.is_dir()]
+
+        for species_path in species_folders:
+            # species_path: .../BirdCLEF2023_split_audio/validation/rerswa1
+            species = os.path.basename(species_path)
+            chunk_path_dst = os.path.join(chunk_path_new, split_name, species)
+            os.makedirs(chunk_path_dst, exist_ok=True)
+
+            chunk_path_src = os.path.join(chunk_path_old, species)
+            if not os.path.isdir(chunk_path_src):
+                continue
+
+            # Scan the chunk folder once per species instead of once per recording.
+            chunks = [(f.name, f.path) for f in os.scandir(chunk_path_src) if f.is_file()]
+            wav_stems = [os.path.splitext(f.name)[0] for f in os.scandir(species_path) if f.is_file()]
+
+            for stem in wav_stems:
+                for chunk_name, chunk_file in chunks:
+                    # Compare against the chunk's file name, not its full path.
+                    if stem in chunk_name:
+                        shutil.copyfile(chunk_file, os.path.join(chunk_path_dst, chunk_name))
+
+
+if __name__ == '__main__':
+    # Exactly one command-line argument is expected: the dataset root.
+    cli_args = sys.argv[1:]
+    if len(cli_args) != 1:
+        print("Incorrect number of args", file=sys.stderr)
+        print("USAGE: python distribute_chunks.py /path", file=sys.stderr)
+        sys.exit(1)
+    data_root = cli_args[0]
+    # Split the recordings, copy their chunks, then empty the split audio folders.
+    distribute_files(data_root)
+    distribute_chunks(data_root)
+    clear_files(os.path.join(data_root, 'BirdCLEF2023_split_audio'))
diff --git a/chunking_methods/file_utils.py b/chunking_methods/file_utils.py
@@ -0,0 +1,10 @@
+import os
+
+def clear_files(path):
+    """Delete the files that sit exactly two folder levels below ``path``.
+
+    For a layout ``<path>/<group>/<leaf>/<file>`` every ``<file>`` is removed.
+    The folders themselves, and files at any other depth, are left alone.
+
+    Args:
+        path: Root folder whose grand-child folders are emptied.
+    """
+    def child_dirs(parent):
+        # Paths of the immediate sub-folders of `parent`.
+        return [entry.path for entry in os.scandir(parent) if entry.is_dir()]
+
+    for group_dir in child_dirs(path):
+        for leaf_dir in child_dirs(group_dir):
+            doomed = [entry.path for entry in os.scandir(leaf_dir) if entry.is_file()]
+            for target in doomed:
+                os.remove(target)
diff --git a/chunking_methods/gen_chunks_from_labels.py b/chunking_methods/gen_chunks_from_labels.py
@@ -0,0 +1,73 @@
+import os
+import sys
+import pydub
+import pandas as pd
+from WTS_chunking import *
+
+
+def generate_chunked_df(path, save_chunks=True):
+    """Chunk the strong-label annotations found under ``path``.
+
+    Reads ``<path>/BirdCLEF2023_Strong_Labels.csv`` and passes it through
+    ``dynamic_yan_chunking`` with ``chunk_count=5``, ``chunk_duration=5`` and
+    ``only_slide=False``.
+
+    Args:
+        path: Folder holding the strong-label CSV.
+        save_chunks: When true, also write the result to
+            ``<path>/BirdCLEF2023_TweetyNet_Chunks.csv``.
+
+    Returns:
+        The value returned by ``dynamic_yan_chunking``.
+    """
+    labels_csv = os.path.join(path, 'BirdCLEF2023_Strong_Labels.csv')
+    chunked_df = dynamic_yan_chunking(
+        pd.read_csv(labels_csv),
+        chunk_count=5,
+        chunk_duration=5,
+        only_slide=False,
+    )
+    if not save_chunks:
+        return chunked_df
+    chunked_df.to_csv(os.path.join(path, 'BirdCLEF2023_TweetyNet_Chunks.csv'))
+    return chunked_df
+
+def generate_wavs_from_chunks(path, chunk_duration):
+    """Cut the chunks listed in the chunk CSV out of the source recordings.
+
+    Reads ``<path>/BirdCLEF2023_TweetyNet_Chunks.csv`` and, row by row, slices
+    ``chunk_duration`` seconds starting at the row's ``OFFSET`` (seconds) from
+    ``<path>/train_audio/<STRONG LABEL>/<IN FILE>``.  Each slice is exported to
+    ``<path>/BirdCLEF2023_train_audio_chunks/<STRONG LABEL>/<name>_<n>.wav``,
+    where ``<name>`` is ``IN FILE`` without its last four characters and
+    ``<n>`` counts the rows of that recording starting at 1.
+
+    The recording is reloaded only when ``IN FILE`` changes between
+    consecutive rows.  A slice whose length differs from the requested
+    duration is reported on stdout and not exported; its number is still
+    consumed.
+
+    Args:
+        path: Dataset root containing the chunk CSV and ``train_audio``.
+        chunk_duration: Length of every chunk, in seconds.
+    """
+    chunk_path = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
+    os.makedirs(chunk_path, exist_ok=True)
+    chunked_df = pd.read_csv(os.path.join(path, 'BirdCLEF2023_TweetyNet_Chunks.csv'))
+    file_name = ''
+    label = ''
+    folder_path = ''
+    wav_file = None
+    chunk_count = 0
+    chunk_duration *= 1000  # pydub works in milliseconds
+    for _, row in chunked_df.iterrows():
+        if row['STRONG LABEL'] != label:
+            label = row['STRONG LABEL']
+            folder_path = os.path.join(chunk_path, label)
+            os.makedirs(folder_path, exist_ok=True)
+
+        if row['IN FILE'] != file_name:
+            # New recording: load it and restart the chunk numbering.
+            file_name = row['IN FILE']
+            wave_file_path = os.path.join(path, 'train_audio', label, file_name)
+            wav_file = pydub.AudioSegment.from_wav(wave_file_path)
+            chunk_count = 1
+
+        # splice wav file and save chunk
+        offset = float(row['OFFSET']) * 1000 # pydub splices in milliseconds, so multiply by 1000
+
+        chunk = wav_file[offset : offset + chunk_duration]
+
+        # Explicit check instead of `assert`, which is stripped under `python -O`
+        # and would then let wrong-length chunks be exported.
+        if len(chunk) == chunk_duration:
+            chunk.export(os.path.join(folder_path, file_name[:-4] + '_' + str(chunk_count) + '.wav'), format='wav')
+        else:
+            print(f"Chunk of length {chunk_duration / 1000}s could not be generated from {file_name}. \n \
+                                                       Got chunk of length {len(chunk) / 1000}s. Check chunking script.")
+        chunk_count += 1
+
+def delete_chunks_with_len(path, length):
+    """Remove every chunk whose duration equals ``length`` seconds.
+
+    Looks at the files directly inside each sub-folder of
+    ``<path>/BirdCLEF2023_train_audio_chunks``, loads each one as a wav and
+    deletes it when its pydub length (milliseconds) equals ``length * 1000``.
+
+    Args:
+        path: Dataset root containing the chunk folder.
+        length: Duration, in seconds, of the chunks to delete.
+    """
+    chunk_root = os.path.join(path, 'BirdCLEF2023_train_audio_chunks')
+    target_ms = length * 1000
+    species_dirs = [entry.path for entry in os.scandir(chunk_root) if entry.is_dir()]
+    for species_dir in species_dirs:
+        candidates = [
+            os.path.join(species_dir, name)
+            for name in os.listdir(species_dir)
+            if os.path.isfile(os.path.join(species_dir, name))
+        ]
+        for candidate in candidates:
+            if len(pydub.AudioSegment.from_wav(candidate)) == target_ms:
+                os.remove(candidate)
+
+
+if __name__ == '__main__':
+    # Exactly one command-line argument is expected: the dataset root.
+    cli_args = sys.argv[1:]
+    if len(cli_args) != 1:
+        print("Incorrect number of args", file=sys.stderr)
+        print("USAGE: python gen_chunks_from_labels.py /path", file=sys.stderr)
+        sys.exit(1)
+    data_root = cli_args[0]
+    # Build the chunk table first, then cut 5-second wav chunks from it.
+    generate_chunked_df(data_root)
+    generate_wavs_from_chunks(data_root, 5)
diff --git a/data-augmentation/gen_raw_chunks.py → chunking_methods/gen_raw_chunks.py b/data-augmentation/gen_raw_chunks.py → chunking_methods/gen_raw_chunks.py
diff --git a/data-augmentation/gen_tweety_labels.py → chunking_methods/gen_tweety_labels.py b/data-augmentation/gen_tweety_labels.py → chunking_methods/gen_tweety_labels.py
diff --git a/chunking_methods/resample.py b/chunking_methods/resample.py
@@ -0,0 +1,61 @@
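+# Resample a class-per-folder dataset: randomly downsample classes that hold more than `down_sample_from` files and upsample classes that hold fewer than `up_sample_to` files (see the constants below for the current thresholds).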
+import os
+import shutil
+from random import shuffle
+from random import choice
+from math import floor
+from file_utils import clear_files
+
+# Source dataset root; distribute_files() scans its sub-folders, one per class.
+chunk_path_old = '/share/acoustic_species_id/pretraining_combined'
+# Destination root for the resampled copy; class folders are created as needed.
+chunk_path_new = '/share/acoustic_species_id/pretraining_combined_resampled'
+# Classes holding more than this many matching files are randomly downsampled.
+down_sample_from = 50
+# Classes holding fewer than this many matching files are upsampled to this
+# count; with 0 the upsampling branch can never trigger.
+up_sample_to = 0
+# Only files whose names end with this suffix are considered.
+filetype = ".mp3"
+
+def distribute_files():
+    """Copy a class-balanced version of ``chunk_path_old`` to ``chunk_path_new``.
+
+    Every sub-folder of ``chunk_path_old`` is treated as one class; only files
+    ending in ``filetype`` are considered and classes without such files are
+    skipped.
+
+    * More than ``down_sample_from`` files: a random selection of exactly
+      ``down_sample_from`` files is copied.
+    * Fewer than ``up_sample_to`` files: every file is copied once, then
+      randomly chosen files are copied again under ``<name>_dup<i><ext>``
+      until the class holds ``up_sample_to`` files.
+    * Otherwise all files are copied unchanged.
+
+    The number of downsampled classes and the number of upsampled classes are
+    printed, in that order, once all classes are processed.
+    """
+    subfolders = [f.path for f in os.scandir(chunk_path_old) if f.is_dir()]
+    upsampled_classes = 0
+    downsampled_classes = 0
+
+    for s in subfolders:
+        species = os.path.basename(s)
+        s_path = os.path.join(chunk_path_new, species)
+        files = [f.name for f in os.scandir(s) if f.name.endswith(filetype)]
+        if not files:
+            continue
+
+        os.makedirs(s_path, exist_ok=True)
+
+        # (source name, destination name) pairs to copy for this class.
+        if len(files) > down_sample_from:
+            shuffle(files)
+            # Slice from index 0 so exactly down_sample_from files are kept.
+            copy_pairs = [(name, name) for name in files[:down_sample_from]]
+            downsampled_classes += 1
+        else:
+            copy_pairs = [(name, name) for name in files]
+            if len(files) < up_sample_to:
+                # Duplicates need their own destination names; copying a file
+                # twice to the same name would only overwrite it.
+                for i in range(up_sample_to - len(files)):
+                    name = choice(files)
+                    stem, ext = os.path.splitext(name)
+                    copy_pairs.append((name, f"{stem}_dup{i}{ext}"))
+                upsampled_classes += 1
+
+        for src_name, dst_name in copy_pairs:
+            shutil.copyfile(os.path.join(s, src_name), os.path.join(s_path, dst_name))
+    print(downsampled_classes)
+    print(upsampled_classes)
+
+
+
+if __name__ == '__main__':
+    # Empty the output class folders left over from a previous run.  Resampled
+    # files are written directly into <chunk_path_new>/<class>/, one level
+    # shallower than the folders clear_files() empties, and on a first run the
+    # output root does not exist yet, so both cases are handled here.
+    if os.path.isdir(chunk_path_new):
+        for class_dir in [d.path for d in os.scandir(chunk_path_new) if d.is_dir()]:
+            for stale in [f.path for f in os.scandir(class_dir) if f.is_file()]:
+                os.remove(stale)
+        clear_files(chunk_path_new)
+    distribute_files()
diff --git a/classification/data_aug/__init__.py b/classification/data_aug/__init__.py