asif-hanif
diff --git a/‎.gitignore
+2 b/‎.gitignore
+2
diff --git a/‎DATASETS.md
+34-34 b/‎DATASETS.md
+34-34
diff --git a/‎README.md
+13-13 b/‎README.md
+13-13
@@ -0,0 +1,2 @@
+__pycache__/*
+pengi/configs/base.pth
@@ -46,18 +46,18 @@ Each CSV file includes the following columns:
 <br>
 
 | Dataset | Type | Classes | Split | Size |
-|:-- |:-- |:--: |:--: | :--: |
-| [Beijing-Opera](#beijing-opera) | Instrument Classification | 4 | Five-Fold | 68 MB
-| [CREMA-D](#crema-d) | Emotion Recognition | 6 | Train-Test | 653M
-| [ESC50](#esc50) | Sound Event Classification | 50 | Five-Fold | 777M
-| [ESC50-Actions](#esc50-actions) | Sound Event Classification | 10 | Five-Fold | 772M 
-| [GT-Music-Genre](#gt-music-genre) | Music Analysis | 10 | Train-Test | 1.4G
-| [NS-Instruments](#ns-instruments) | Instrument Classification | 10 | Train-Test | 14G 
-| [RAVDESS](#ravdess) | Emotion Recognition | 8 | Train-Test | 683M
-| [SESA](#sesa) | Surveillance Sound Classification | 4 | Train-Test | 51M
-| [TUT2017](#tut2017) | Acoustic Scene Classification | 15 | Four-Fold | 12G 
-| [UrbanSound8K](#urbansound8k) | Sound Event Classification | 10 | Ten-Fold | 6.8G 
-| [VocalSound](#vocalsound) | Vocal Sound Classification | 6 | Train-Test | 6.9G
+|:-- |:-- |:--: |:--: | --: |
+| [Beijing-Opera](#beijing-opera) | Instrument Classification | 4 | Five-Fold | 69 MB |
+| [CREMA-D](#crema-d) | Emotion Recognition | 6 | Train-Test | 606 MB |
+| [ESC50](#esc50) | Sound Event Classification | 50 | Five-Fold | 881 MB |
+| [ESC50-Actions](#esc50-actions) | Sound Event Classification | 10 | Five-Fold | 881 MB | 
+| [GT-Music-Genre](#gt-music-genre) | Music Analysis | 10 | Train-Test | 1.3 GB |
+| [NS-Instruments](#ns-instruments) | Instrument Classification | 10 | Train-Test | 18.5 GB |
+| [RAVDESS](#ravdess) | Emotion Recognition | 8 | Train-Test | 1.1 GB |
+| [SESA](#sesa) | Surveillance Sound Classification | 4 | Train-Test | 70 MB |
+| [TUT2017](#tut2017) | Acoustic Scene Classification | 15 | Four-Fold | 12.3 GB | 
+| [UrbanSound8K](#urbansound8k) | Sound Event Classification | 10 | Ten-Fold | 6.8 GB | 
+| [VocalSound](#vocalsound) | Vocal Sound Classification | 6 | Train-Test | 8.2 GB |
 
 <br><br>
 <hr><hr>
@@ -78,8 +78,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/Beijing-Opera", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "Beijing-Opera"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Instrument Classification | 4 | Five-Fold | 68 MB |
+|:-- |:--: |:--: | --: |
+| Instrument Classification | 4 | Five-Fold | 69 MB |
 
 <br>
 <hr>
@@ -94,8 +94,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/CREMA-D", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "CREMA-D"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Emotion Recognition | 6 | Train-Test |  |
+|:-- |:--: |:--: | --: |
+| Emotion Recognition | 6 | Train-Test | 606 MB |
 
 <br>
 <hr>
@@ -110,8 +110,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/ESC50", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "ESC50"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Sound Event Classification | 50 | Five-Fold |  |
+|:-- |:--: |:--: | --: |
+| Sound Event Classification | 50 | Five-Fold | 881 MB |
 
 <br>
 <hr>
@@ -126,8 +126,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/ESC50-Actions", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "ESC50-Actions"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Sound Event Classification | 10 | Five-Fold |  |
+|:-- |:--: |:--: | --: |
+| Sound Event Classification | 10 | Five-Fold | 881 MB |
 
 <br>
 <hr>
@@ -142,8 +142,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/GT-Music-Genre", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "GT-Music-Genre"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Music Analysis | 10 | Train-Test |  |
+|:-- |:--: |:--: | --: |
+| Music Analysis | 10 | Train-Test | 1.3 GB |
 
 <br>
 <hr>
@@ -158,8 +158,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/NS-Instruments", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "NS-Instruments"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Instrument Classification | 10 | Train-Test |  |
+|:-- |:--: |:--: | --: |
+| Instrument Classification | 10 | Train-Test | 18.5 GB |
 
 <br>
 <hr>
@@ -174,8 +174,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/RAVDESS", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "RAVDESS"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Emotion Recognition | 8 | Train-Test |  |
+|:-- |:--: |:--: | --: |
+| Emotion Recognition | 8 | Train-Test | 1.1 GB |
 
 <br>
 <hr>
@@ -190,8 +190,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/SESA", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "SESA"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Surveillance Sound Classification | 4 | Train-Test |  |
+|:-- |:--: |:--: | --: |
+| Surveillance Sound Classification | 4 | Train-Test | 70 MB |
 
 <br>
 <hr>
@@ -206,8 +206,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/TUT2017", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "TUT2017"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Acoustic Scene Classification | 15 | Four-Fold |  |
+|:-- |:--: |:--: | --: |
+| Acoustic Scene Classification | 15 | Four-Fold | 12.3 GB |
 
 
 <br>
@@ -223,8 +223,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/UrbanSound8K", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "UrbanSound8K"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Sound Event Classification | 10 | Ten-Fold |  |
+|:-- |:--: |:--: | --: |
+| Sound Event Classification | 10 | Ten-Fold | 6.8 GB |
 
 <br>
 <hr>
@@ -239,8 +239,8 @@ if not os.path.exists(audio_datasets_path): print(f"Given {audio_datasets_path=}
 huggingface_hub.snapshot_download(repo_id="MahiA/VocalSound", repo_type="dataset", local_dir=os.path.join(audio_datasets_path, "VocalSound"))
 ```
 |Type | Classes | Split | Size |
-|:-- |:--: |:--: | :--: |
-| Vocal Sound Classification | 6 | Train-Test |  |
+|:-- |:--: |:--: | --: |
+| Vocal Sound Classification | 6 | Train-Test | 8.2 GB |
 
 <br>
 <hr>
 
@@ -97,23 +97,23 @@ wget https://zenodo.org/records/8387083/files/base.pth
 We have performed experiments on 11 audio classification datasets.  Instructions for downloading/processing datasets used by our method have been provided in the [DATASETS.md](DATASETS.md). 
 
 | Dataset | Type | Classes | Size | Link |
-|:-- |:-- |:--: |:--: |:-- |
-| [Beijing-Opera](https://compmusic.upf.edu/bo-perc-dataset) | Instrument Classification | 4 | | [Instructions](DATASETS.md#beijing-opera) |
-| [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D) | Emotion Recognition | 6 | | [Instructions](DATASETS.md#crema-d) |
-| [ESC50](https://github.com/karolpiczak/ESC-50) | Sound Event Classification | 50 | | [Instructions](DATASETS.md#esc50) |
-| [ESC50-Actions](https://github.com/karolpiczak/ESC-50) | Sound Event Classification | 10 | | [Instructions](DATASETS.md#esc50-actions) |
-| [GT-Music-Genre](https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification) | Music Analysis | 10 | | [Instructions](DATASETS.md#gt-music-genre) |
-| [NS-Instruments](https://magenta.tensorflow.org/datasets/nsynth) | Instrument Classification | 10 | | [Instructions](DATASETS.md#ns-instruments) |
-| [RAVDESS](https://zenodo.org/records/1188976#.YFZuJ0j7SL8) | Emotion Recognition | 8 | | [Instructions](DATASETS.md#ravdess) |
-| [SESA](https://zenodo.org/records/3519845) | Surveillance Sound Classification | 4 | | [Instructions](DATASETS.md#sesa) |
-| [TUT2017](https://zenodo.org/records/400515) | Acoustic Scene Classification | 15 | | [Instructions](DATASETS.md#tut2017) |
-| [UrbanSound8K](https://urbansounddataset.weebly.com/urbansound8k.html) | Sound Event Classification | 10 | | [Instructions](DATASETS.md#urbansound8k) |
-| [VocalSound](https://github.com/YuanGongND/vocalsound) | Vocal Sound Classification | 6 | | [Instructions](DATASETS.md#vocalsound) |
+|:-- |:-- |:--: |--: |:-- |
+| [Beijing-Opera](https://compmusic.upf.edu/bo-perc-dataset) | Instrument Classification | 4 | 69 MB | [Instructions](DATASETS.md#beijing-opera) |
+| [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D) | Emotion Recognition | 6 | 606 MB | [Instructions](DATASETS.md#crema-d) |
+| [ESC50](https://github.com/karolpiczak/ESC-50) | Sound Event Classification | 50 | 881 MB | [Instructions](DATASETS.md#esc50) |
+| [ESC50-Actions](https://github.com/karolpiczak/ESC-50) | Sound Event Classification | 10 | 881 MB | [Instructions](DATASETS.md#esc50-actions) |
+| [GT-Music-Genre](https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification) | Music Analysis | 10 | 1.3 GB | [Instructions](DATASETS.md#gt-music-genre) |
+| [NS-Instruments](https://magenta.tensorflow.org/datasets/nsynth) | Instrument Classification | 10 | 18.5 GB | [Instructions](DATASETS.md#ns-instruments) |
+| [RAVDESS](https://zenodo.org/records/1188976#.YFZuJ0j7SL8) | Emotion Recognition | 8 | 1.1 GB | [Instructions](DATASETS.md#ravdess) |
+| [SESA](https://zenodo.org/records/3519845) | Surveillance Sound Classification | 4 | 70 MB | [Instructions](DATASETS.md#sesa) |
+| [TUT2017](https://zenodo.org/records/400515) | Acoustic Scene Classification | 15 | 12.3 GB | [Instructions](DATASETS.md#tut2017) |
+| [UrbanSound8K](https://urbansounddataset.weebly.com/urbansound8k.html) | Sound Event Classification | 10 | 6.8 GB | [Instructions](DATASETS.md#urbansound8k) |
+| [VocalSound](https://github.com/YuanGongND/vocalsound) | Vocal Sound Classification | 6 | 8.2 GB | [Instructions](DATASETS.md#vocalsound) |
 
 </br>
 </br>
 
-All datasets should be placed in a directory named `Audio-Datasets,` and the path of this directory should be specified in the variable `DATASET_ROOT` in the shell [`scripts`](/scripts/). The directory structure should be as follows:
+All datasets should be placed in a directory named `Audio-Datasets`, and the path of this directory should be specified in the variable `DATASET_ROOT` in the shell [`scripts`](/scripts/). The directory structure should be as follows:
 ```
 Audio-Datasets/
     ├── Beijing-Opera/
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+__pycache__/*`
	`2`	`+pengi/configs/base.pth`