ebezzam · ebezzam · Aug 18, 2024 · Aug 18, 2024 · Aug 18, 2024 · Aug 18, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 project_env/
 .DS_Store
+dataset/*
 
 # from hydra
 outputs/   

diff --git a/README.rst b/README.rst
@@ -61,7 +61,8 @@ choose the features that you like. This flexibility is one of the best
 * Code formatting.
 * Unit tests and continuous integration.
 * Packaging and distribution.
-* Remove development.
+* Remote development.
+* Creating and sharing datasets with Hugging Face.
 
 The accompanying 
 `slides <https://docs.google.com/presentation/d/1D1_JywMl2rjaeuVzpykPBOJsDIuwQKGOJB4EFZjej2s/edit#slide=id.g2eaa4b61f15_0_1346>`__ 

diff --git a/examples/configs/hf_dataset.yaml b/examples/configs/hf_dataset.yaml
@@ -0,0 +1,29 @@
+# python examples/create_huggingface_dataset.py hf_token=YOUR_TOKEN
+hydra:
+  job:
+    chdir: True    # change to output folder
+  job_logging:
+    formatters:
+      simple:
+        format: '[%(levelname)s] - %(message)s'
+
+# Hugging Face dataset repository that the data is pushed to
+repo_id: bezzam/dummy-dataset
+# seed used for the train/test split
+seed: 0
+# size of the test split passed to `train_test_split`
+test_size: 0.15
+# Hugging Face token; the script refuses to run if it is left empty
+hf_token:
+
+# Each entry becomes one column of the dataset, named after its key.
+# - `dir` + `type`: the files with extension `type` found in directory `dir`
+# - `file`: CSV file without header, first column is the file name (without
+#   extension) and second column is the value; with `label: True` the column
+#   is cast to a class label
+# Relative paths are converted to absolute paths by the script.
+data_dir:
+  audio:
+    dir: dataset/data_audio
+    type: wav
+  image:
+    dir: dataset/data_images
+    type: png
+  text:
+    dir: dataset/data_text
+    type: txt
+  label:
+    file: dataset/data_labels.csv
+    label: True
+
+# column used to stratify the train/test split
+stratify_by_column: label
diff --git a/examples/create_huggingface_dataset.py b/examples/create_huggingface_dataset.py
@@ -0,0 +1,232 @@
+"""
+We will create a dataset with image, audio, and text data
+so that you can see how various data types can be pushed to
+Hugging Face!
+
+The default configuration is in `examples/configs/hf_dataset.yaml`:
+
+```bash
+# install
+pip install datasets huggingface_hub soundfile
+
+# make a WRITE token on HuggingFace: https://huggingface.co/settings/tokens
+
+# run
+python examples/create_huggingface_dataset.py hf_token=...
+```
+"""
+
+import hydra
+from hydra.utils import to_absolute_path
+import os
+import time
+import glob
+import numpy as np
+import soundfile as sf
+from PIL import Image as PILImage
+from datasets import Dataset, Image, Audio, ClassLabel
+from omegaconf import open_dict
+from huggingface_hub import upload_file
+import re
+import pandas as pd
+
+
+# -- helper functions
+def convert(text):
+    """Convert a chunk of a sort key: digits become an int, the rest lower-case text.
+
+    Only ASCII digits are converted. `str.isdigit` alone also accepts
+    characters such as "²" that `int` cannot parse, which would raise a
+    `ValueError` while sorting.
+    """
+    if text.isascii() and text.isdigit():
+        return int(text)
+    return text.lower()
+
+
+def alphanum_key(key):
+    """Split `key` into text and number chunks so that numbers compare by value."""
+    # the capturing group keeps the digit runs in the result of the split
+    chunks = re.split("([0-9]+)", key)
+    return list(map(convert, chunks))
+
+
+def natural_sort(arr):
+    """Return a new list with the items of `arr` in natural order (e.g. "2" before "10")."""
+    ordered = list(arr)
+    ordered.sort(key=alphanum_key)
+    return ordered
+
+
+@hydra.main(version_base=None, config_path="configs", config_name="hf_dataset")
+def main(config):
+    """Create a train/test dataset from local data and push it to Hugging Face.
+
+    Every entry of `config.data_dir` becomes one column of the dataset:
+
+    - entries with `dir` and `type` contribute the files with that extension
+      (dummy files are created if the directory does not exist),
+    - entries with `file` point to a CSV file without header, whose first
+      column is the file name (without extension) and whose second column is
+      the value (a dummy file is created if it does not exist).
+
+    Only samples that are present in all entries are kept. The dataset is
+    split into train and test sets, pushed to `config.repo_id`, and the first
+    file of each directory is uploaded as an example.
+
+    Raises a `ValueError` if `repo_id` or `hf_token` is missing, if an entry
+    has neither `dir` nor `file`, or if no sample is common to all entries.
+    """
+
+    start_time = time.time()
+
+    # extract and check parameters
+    repo_id = config.repo_id
+    hf_token = config.hf_token
+    test_size = config.test_size
+
+    # raise instead of assert, as assertions are removed when running with -O
+    if repo_id is None:
+        raise ValueError("Please provide a Hugging Face repo_id.")
+    if hf_token is None:
+        raise ValueError("Please provide a Hugging Face token.")
+
+    # to absolute path, as needed by Hugging Face upload
+    for data in config.data_dir:
+        if "dir" in config.data_dir[data]:
+            config.data_dir[data]["dir"] = to_absolute_path(config.data_dir[data]["dir"])
+        elif "file" in config.data_dir[data]:
+            config.data_dir[data]["file"] = to_absolute_path(config.data_dir[data]["file"])
+
+    # Step 1: Check data (create dummy data if not present)
+    # NOTE: `n_files` is overwritten with the number of samples found in each
+    # entry, and that count is used when creating dummy data for later entries
+    n_files = 100  # number of dummy files to create
+    for data in config.data_dir:
+        entry = config.data_dir[data]
+
+        # for directory of data
+        if "dir" in entry:
+            input_dir = entry["dir"]
+            data_type = entry["type"]
+
+            if not os.path.exists(input_dir):
+                # create dummy data
+                print(f"-- Creating {n_files} dummy {data_type} files in {input_dir}")
+                os.makedirs(input_dir, exist_ok=True)
+                for i in range(n_files):
+                    if data_type == "png":
+                        dim = np.random.randint(100, 200)
+                        # upper bound is exclusive, 256 covers the full uint8 range
+                        img = np.random.randint(0, 256, (dim, dim, 3), dtype=np.uint8)
+                        img_path = os.path.join(input_dir, f"{i}.png")
+                        PILImage.fromarray(img).save(img_path)
+                    elif data_type == "wav":
+                        duration = np.random.randint(1, 4)
+                        sample_rate = 16000
+                        audio = np.random.randn(duration * sample_rate)
+                        audio_path = os.path.join(input_dir, f"{i}.wav")
+                        sf.write(audio_path, audio, samplerate=sample_rate)
+                    elif data_type == "txt":
+                        text = f"Hello, this is file {i}"
+                        text_path = os.path.join(input_dir, f"{i}.txt")
+                        with open(text_path, "w") as f:
+                            f.write(text)
+
+            # check number of files
+            files = glob.glob(os.path.join(input_dir, "*." + data_type))
+            n_files = len(files)
+            print(f"Found {n_files} {data_type} files in {input_dir}")
+
+        # for CSV file where each line is a data point
+        elif "file" in entry:
+            input_file = entry["file"]
+
+            if not os.path.exists(input_file):
+                # create dummy labels (and the parent folder if needed)
+                os.makedirs(os.path.dirname(input_file), exist_ok=True)
+                labels = ["good", "ok", "bad"]
+                file_labels = np.random.choice(labels, n_files)
+                with open(input_file, "w") as f:
+                    for i in range(n_files):
+                        f.write(f"{i},{file_labels[i]}\n")
+                print(f"-- Created dummy labels file at {input_file}")
+
+            # check number of unique labels (open with Pandas)
+            df = pd.read_csv(input_file, header=None)
+            n_files = len(df)
+            labels = df[1].unique()
+            n_labels = len(labels)
+            print(f"Found {n_files} lines with {n_labels} unique labels ({labels}) in {input_file}")
+
+        else:
+            raise ValueError("Please provide either `dir` or `file` in data_dir")
+
+    # -- only keep common files across all datasets (directories AND CSV files)
+    common_ids = None
+    tables = {}  # CSV content of each `file` entry, indexed by file name
+    for data in config.data_dir:
+        entry = config.data_dir[data]
+        if "dir" in entry:
+            found = glob.glob(os.path.join(entry["dir"], "*." + entry["type"]))
+            # splitext (instead of splitting at the first dot) keeps names with dots
+            ids = {os.path.splitext(os.path.basename(f))[0] for f in found}
+        else:
+            # read first column as string so that it can be compared to file names
+            df = pd.read_csv(entry["file"], header=None, dtype={0: str})
+            tables[data] = df.drop_duplicates(subset=[0]).set_index(0)
+            ids = set(tables[data].index)
+        # accumulate the intersection over all entries
+        common_ids = ids if common_ids is None else common_ids & ids
+    if not common_ids:
+        raise ValueError("No common files found across the entries of data_dir.")
+    common_files = natural_sort(common_ids)
+    print(f"Number of common files: {len(common_files)}")
+
+    # Step 2: Create train and test data
+    dataset_dict = {}
+
+    # -- create dictionary of content, every column follows the order of `common_files`
+    for data in config.data_dir:
+        entry = config.data_dir[data]
+        if "dir" in entry:
+            data_type = entry["type"]
+            data_files = [os.path.join(entry["dir"], f"{name}.{data_type}") for name in common_files]
+
+            if data_type in ["txt"]:
+                # open file content for text files
+                contents = []
+                for path in data_files:
+                    with open(path) as f:
+                        contents.append(f.read())
+                data_files = contents
+            dataset_dict[data] = data_files
+        else:
+            # take rows according to common_files
+            dataset_dict[data] = tables[data].loc[common_files, 1].tolist()
+
+    # -- create dataset
+    dataset = Dataset.from_dict(dataset_dict)
+    for data in config.data_dir:
+        entry = config.data_dir[data]
+        if "dir" in entry:
+            if entry["type"] in ["png", "jpg", "jpeg", "tiff"]:
+                dataset = dataset.cast_column(data, Image())
+            elif entry["type"] in ["wav", "mp3", "flac", "ogg"]:
+                dataset = dataset.cast_column(data, Audio())
+        elif "label" in entry and entry["label"]:
+            # sorted so that the label <-> integer mapping is the same at every run
+            labels = sorted(set(dataset_dict[data]))
+            dataset = dataset.cast_column(data, ClassLabel(names=labels))
+
+    # -- split into train and test
+    dataset = dataset.train_test_split(
+        test_size=test_size,
+        seed=config.seed,
+        shuffle=True,
+        stratify_by_column=config.stratify_by_column,  # shuffle must be True
+    )
+    print(dataset)
+
+    # Expected output with the default configuration and dummy data:
+    #
+    # DatasetDict({
+    #     train: Dataset({
+    #         features: ['audio', 'image', 'text', 'label'],
+    #         num_rows: 85
+    #     })
+    #     test: Dataset({
+    #         features: ['audio', 'image', 'text', 'label'],
+    #         num_rows: 15
+    #     })
+    # })
+
+    # Step 3: Push to Hugging Face
+    dataset.push_to_hub(repo_id, token=hf_token)
+
+    # -- push individual files
+    for data in config.data_dir:
+        entry = config.data_dir[data]
+        if "dir" in entry:
+            # push first file
+            data_type = entry["type"]
+            local_fp = os.path.join(entry["dir"], common_files[0] + "." + data_type)
+            remote_fn = "example." + data_type
+            upload_file(
+                path_or_fileobj=local_fp,
+                path_in_repo=remote_fn,
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=hf_token,
+            )
+
+    # total time in minutes
+    print(f"Total time: {(time.time() - start_time) / 60} minutes")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/use_huggingface_dataset.py b/examples/use_huggingface_dataset.py
@@ -0,0 +1,50 @@
+"""
+In this script, we use the Hugging Face dataset made
+from the script examples/create_huggingface_dataset.py
+
+The dataset is available at:
+https://huggingface.co/datasets/bezzam/dummy-dataset
+
+```bash
+# install
+pip install datasets librosa soundfile
+
+# run
+python examples/use_huggingface_dataset.py
+```
+
+During the first run, the dataset will be downloaded and cached.
+Subsequent runs will use the cached dataset.
+
+"""
+
+from datasets import load_dataset
+import numpy as np
+
+
+# dataset repository on the Hugging Face Hub
+repo_id = "bezzam/dummy-dataset"
+
+# fetch the train and test splits
+ds_train = load_dataset(repo_id, split="train")
+ds_test = load_dataset(repo_id, split="test")
+print(f"Number of training samples: {len(ds_train)}")
+print(f"Number of test samples: {len(ds_test)}")
+
+# inspect the first training sample
+print("\n---- First example:")
+first = ds_train[0]
+
+# -- audio: duration is the number of samples divided by the sampling rate
+audio = first["audio"]
+seconds = len(audio["array"]) / audio["sampling_rate"]
+print(f"Duration of audio: {seconds:.2f} seconds")
+
+# -- image: convert to a NumPy array to read its shape
+pixels = np.array(first["image"])
+print(f"Size of image: {pixels.shape}")
+
+# -- text
+content = first["text"]
+print(f"Text: {content}")
+
+# -- label: stored as an integer, converted back to its name
+label_feature = ds_train.features["label"]
+label_name = label_feature.int2str(first["label"])
+print(f"Label: {label_name}")