mlcommons · monke6942021 · Feb 8, 2024 · Sep 14, 2023 · Sep 14, 2023 · Sep 14, 2023
diff --git a/datasets/audio_test/data/Clap.mp3 b/datasets/audio_test/data/Clap.mp3
diff --git a/datasets/audio_test/data/Snap.mp3 b/datasets/audio_test/data/Snap.mp3
diff --git a/datasets/audio_test/metadata.json b/datasets/audio_test/metadata.json
@@ -0,0 +1,75 @@
+{
+  "@context": {
+    "@language": "en",
+    "@vocab": "https://schema.org/",
+    "column": "ml:column",
+    "conformsTo": "dct:conformsTo",
+    "data": {
+      "@id": "ml:data",
+      "@type": "@json"
+    },
+    "dataBiases": "ml:dataBiases",
+    "dataCollection": "ml:dataCollection",
+    "dataType": {
+      "@id": "ml:dataType",
+      "@type": "@vocab"
+    },
+    "dct": "http://purl.org/dc/terms/",
+    "extract": "ml:extract",
+    "field": "ml:field",
+    "fileProperty": "ml:fileProperty",
+    "format": "ml:format",
+    "includes": "ml:includes",
+    "isEnumeration": "ml:isEnumeration",
+    "jsonPath": "ml:jsonPath",
+    "ml": "http://mlcommons.org/schema/",
+    "parentField": "ml:parentField",
+    "path": "ml:path",
+    "personalSensitiveInformation": "ml:personalSensitiveInformation",
+    "recordSet": "ml:recordSet",
+    "references": "ml:references",
+    "regex": "ml:regex",
+    "repeated": "ml:repeated",
+    "replace": "ml:replace",
+    "sc": "https://schema.org/",
+    "separator": "ml:separator",
+    "source": "ml:source",
+    "subField": "ml:subField",
+    "transform": "ml:transform",
+    "wd": "https://www.wikidata.org/wiki/"
+  },
+  "@type": "sc:Dataset",
+  "name": "audio_test",
+  "description": "This is the basic test case for audio files",
+  "conformsTo": "http://mlcommons.org/croissant/1.0",
+  "url": "None",
+  "distribution": [
+    {
+      "@type": "sc:FileSet",
+      "name": "files",
+      "encodingFormat": "audio/mpeg",
+      "includes": "data/*.mp3"
+    }
+  ],
+  "recordSet": [
+    {
+      "@type": "ml:RecordSet",
+      "name": "records",
+      "description": "These are the records.",
+      "field": [
+        {
+          "@type": "ml:Field",
+          "name": "audio",
+          "description": "These are the sounds.",
+          "dataType": "sc:AudioObject",
+          "source": {
+            "distribution": "files",
+            "extract": {
+              "fileProperty": "content"
+            }
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/datasets/audio_test/output/records.jsonl b/datasets/audio_test/output/records.jsonl
diff --git a/editor/events/resources.py b/editor/events/resources.py
@@ -72,6 +72,6 @@ def _create_instance1_from_instance2(instance1: Resource, instance2: type):
     attributes1 = set((field.name for field in dataclasses.fields(instance1)))
     attributes2 = set((field.name for field in dataclasses.fields(instance2)))
     common_attributes = attributes2.intersection(attributes1)
-    return instance2(**{
-        attribute: getattr(instance1, attribute) for attribute in common_attributes
-    })
+    return instance2(
+        **{attribute: getattr(instance1, attribute) for attribute in common_attributes}
+    )
@@ -55,6 +55,14 @@
 SCHEMA_ORG_CREATOR = namespace.SDO.creator
 SCHEMA_ORG_DATE_PUBLISHED = namespace.SDO.datePublished
 SCHEMA_ORG_DATASET = namespace.SDO.Dataset
+SCHEMA_ORG_DATA_TYPE_AUDIO_OBJECT = namespace.SDO.AudioObject
+SCHEMA_ORG_DATA_TYPE_BOOL = namespace.SDO.Boolean
+SCHEMA_ORG_DATA_TYPE_DATE = namespace.SDO.Date
+SCHEMA_ORG_DATA_TYPE_FLOAT = namespace.SDO.Float
+SCHEMA_ORG_DATA_TYPE_IMAGE_OBJECT = namespace.SDO.ImageObject
+SCHEMA_ORG_DATA_TYPE_INTEGER = namespace.SDO.Integer
+SCHEMA_ORG_DATA_TYPE_TEXT = namespace.SDO.Text
+SCHEMA_ORG_DATA_TYPE_URL = namespace.SDO.URL
 SCHEMA_ORG_DESCRIPTION = namespace.SDO.description
 SCHEMA_ORG_DISTRIBUTION = namespace.SDO.distribution
 SCHEMA_ORG_EMAIL = namespace.SDO.email
@@ -124,8 +132,10 @@ class EncodingFormat:
 
     CSV = "text/csv"
     GIT = "git+https"
+    JPG = "image/jpeg"
     JSON = "application/json"
     JSON_LINES = "application/jsonlines"
+    MP3 = "audio/mpeg"
     PARQUET = "application/x-parquet"
     TEXT = "text/plain"
     TSV = "text/tsv"
@@ -136,6 +146,7 @@ class EncodingFormat:
 class DataType:
     """Data types supported by Croissant."""
 
+    AUDIO_OBJECT = namespace.SDO.AudioObject
     BOOL = namespace.SDO.Boolean
     BOUNDING_BOX = ML_COMMONS.BoundingBox
     DATE = namespace.SDO.Date

@@ -86,5 +86,10 @@ def PIL_Image(cls) -> types.ModuleType:  # pylint: disable=invalid-name
         """Cached git module."""
         return _try_import("PIL.Image", package_name="Pillow")
 
+    @cached_class_property
+    def LIB_Audio(cls) -> types.ModuleType:  # pylint: disable=invalid-name
+        """Cached librosa module."""
+        return _try_import("librosa", package_name="librosa")
+
 
 deps = OptionalDependencies
@@ -127,6 +127,11 @@ def test_hermetic_loading(dataset_name, record_set_name, num_records):
         ["huggingface-c4/metadata.json", "en", 1],
         ["huggingface-mnist/metadata.json", "default", 10],
         ["titanic/metadata.json", "passengers", -1],
+        [
+            "audio_test/metadata.json",
+            "records",
+            -1,
+        ],  # Switch the number to 10 if necessary
     ],
 )
 def test_nonhermetic_loading(dataset_name, record_set_name, num_records):

@@ -31,6 +31,9 @@ def _cast_value(value: Any, data_type: type | term.URIRef | None):
             return deps.PIL_Image.open(io.BytesIO(value))
         else:
             raise ValueError(f"Type {type(value)} is not accepted for an image.")
+    elif data_type == DataType.AUDIO_OBJECT:
+        output = deps.LIB_Audio.load(io.BytesIO(value))
+        return str([output[0].tolist(), output[1]])
     elif data_type == DataType.BOUNDING_BOX:
         return bounding_box.parse(value)
     elif not isinstance(data_type, type):

@@ -120,6 +120,13 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
                     return pd.DataFrame({
                         FileProperty.content: [file.read()],
                     })
+            elif (
+                encoding_format == EncodingFormat.MP3
+                or encoding_format == EncodingFormat.JPG
+            ):
+                return pd.DataFrame({
+                    FileProperty.content: [file.read()],
+                })
             else:
                 raise ValueError(
                     f"Unsupported encoding format for file: {encoding_format}"

@@ -101,6 +101,7 @@ def data_type(self) -> type | term.URIRef | None:
                 elif data_type in [
                     DataType.IMAGE_OBJECT,
                     DataType.BOUNDING_BOX,
+                    DataType.AUDIO_OBJECT,
                 ]:
                     return term.URIRef(data_type)
         # The data_type has to be found on a predecessor:

@@ -7,6 +7,7 @@ version = "0.0.5"
 authors = [
   { name = "Joaquin Vanschoren" },
   { name = "Jos van der Velde" },
+  { name = "Monjish Bhattacharyya" },
   { name = "Omar Benjelloun" },
   { name = "Peter Mattson" },
   { name = "Pieter Gijsbers" },
@@ -18,6 +19,7 @@ authors = [
 # pip dependencies of the project
 # Installed locally with `pip install -e .`
 dependencies = [
+  "black[jupyter]",
   "absl-py",
   "etils[epath]",
   "jsonpath-rw",
@@ -27,7 +29,7 @@ dependencies = [
   "python-dateutil",
   "rdflib",
   "requests",
-  "tqdm",
+  "tqdm"
 ]
 readme = "README.md"
 
@@ -38,6 +40,7 @@ dev = [
   "black==23.11.0",
   "datasets",
   "flake8-docstrings",
+  "mlcroissant[audio]",
   "mlcroissant[git]",
   "mlcroissant[image]",
   "mlcroissant[parquet]",
@@ -48,6 +51,7 @@ dev = [
   "pytest",
   "pytype",
 ]
+audio = ["librosa"]
 git = ["GitPython"]
 image = ["Pillow"]
 parquet = ["pyarrow"]
@@ -79,9 +83,10 @@ module = [
   "datasets",
   "etils.*",
   "jsonpath_rw",
+  "librosa",
   "networkx",
   "pandas",
-  "pillow",
+  "pillow"
 ]
 ignore_missing_imports = true